Example #1
class CNLP:
    CNLPServerURL = 'http://localhost:9000'

    def __init__(self):
        self.parser = CoreNLPParser(url=self.CNLPServerURL)
        self.dep_parser = CoreNLPDependencyParser(url=self.CNLPServerURL)
        self.ner_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='ner')
        self.pos_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='pos')

    def getParse(self, sentence):
        if (type(sentence) == list):
            return self.parser.parse(sentence)
        else:
            return self.parser.raw_parse(sentence)

    def getDepParse(self, sentence):
        if (type(sentence) == list):
            return self.dep_parser.parse(sentence)
        else:
            return self.dep_parser.raw_parse(sentence)

    def getNERTags(self, sentence):
        if (type(sentence) != list):
            sentence = sentence.split()
        return self.ner_tagger.tag(sentence)

    def getPOSTags(self, sentence):
        if (type(sentence) != list):
            sentence = sentence.split()
        return self.pos_tagger.tag(sentence)
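A minimal usage sketch for the wrapper above, assuming a CoreNLP server is already running at localhost:9000 (the sample sentences are made up):

cnlp = CNLP()
print(cnlp.getNERTags('Rami Eid is studying at Stony Brook University in NY'))
print(cnlp.getPOSTags('What is the airspeed of an unladen swallow ?'))
print(next(cnlp.getParse('The quick brown fox jumps over the lazy dog .')))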
Example #2
def create_dataset_bin(annotation_file, data_file):
    parser = CoreNLPParser(url='http://localhost:9080')
    dirname = os.path.dirname(os.path.realpath(__file__)) + "/"

    dataset = []

    with open(annotation_file, "r") as file1, open(data_file, "r") as file2:
        for line_from_file_1, line_from_file_2 in zip(file1, file2):
            line1 = line_from_file_1.split()
            line2 = line_from_file_2
            # Map the two-letter annotation code to an emotion class id
            label_map = {"ne": 7, "hp": 0, "sd": 1, "ag": 2,
                         "dg": 3, "sp": 4, "fr": 5, "me": 6}
            output = label_map.get(line1[0])
            dataset.append((output, list(parser.tokenize(line2))))
    print(len(dataset))

    with open(dirname + "Pickle/dataset_ready", 'wb') as outfile:
        cPickle.dump(dataset, outfile)
Example #3
    def parse_tree(self, s):
        parser = CoreNLPParser()

        parse = next(parser.raw_parse(s))
        # parse.draw()

        return parse
Example #4
def annotate(sentence, lower=True):
    global client
   
    nlp = CoreNLPParser('http://localhost:9000')

    res = nlp.api_call(sentence, properties={'annotators': 'tokenize,ssplit'})

        
    words, gloss, after = [], [], []
    
    print(sentence)
    for t in res['sentences']:
        for i in range(len(t['tokens'])):
            words.append(t['tokens'][i]['word'])
            gloss.append(t['tokens'][i]['originalText'])
            after.append(t['tokens'][i]['after'])
    if lower:
        words = [w.lower() for w in words]
    result = {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
    print(result)
    return result
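A quick usage sketch of annotate() above (assumes a CoreNLP server on localhost:9000; the sentence is a placeholder):

tokens = annotate('Stanford CoreNLP splits this sentence .')
print(tokens['words'])  # lower-cased tokens
print(tokens['gloss'])  # original surface forms
print(tokens['after'])  # whitespace following each token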
Example #5
    def run_nlp(self, language):
        # Make sure server is running properly (as explained in https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK) :
        # might need root
        # english: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,sentiment -status_port 9000 -port 9000 -timeout 15000
        # The German models cannot do sentiment analysis, so their predictions carry no relevance; the code is kept as-is to make it easier to add some sentiment analysis of the parsed German text in the future.
        # If the service times out, increasing the timeout helps. This usually happens when a sentence is too long to be handled within the given period.
        self.__check_language(language)
        util.time_log("starting NLP...")
        annotator_dict = {"annotators": "sentiment"}
        classifier = CoreNLPParser("http://localhost:9000")

        ret_list = []

        for k_iter in range(0, self.k):
            prediction = []
            for review in self.test_data_text(language, k_iter):
                response_dict = classifier.api_call(review,
                                                    properties=annotator_dict,
                                                    timeout=500)
                count = 0
                sentiment = 0.0
                for sentence in response_dict["sentences"]:
                    count += 1
                    sentiment += float(sentence["sentimentValue"])

                avg_sentiment = sentiment / count
                # a lot better results with >=2
                prediction.append(1 if avg_sentiment >= 2 else 0)
            ret_list.append(prediction)
        return ret_list
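The core of the loop above is a single api_call with the sentiment annotator; here is a minimal standalone sketch under the same server assumption (the URL and review text are placeholders):

from nltk.parse.corenlp import CoreNLPParser

classifier = CoreNLPParser("http://localhost:9000")
response = classifier.api_call("The food was great but the service was slow.",
                               properties={"annotators": "sentiment"},
                               timeout=500)
# CoreNLP reports one sentimentValue (a string from "0" to "4") per sentence
scores = [float(s["sentimentValue"]) for s in response["sentences"]]
print(sum(scores) / len(scores))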
Example #6
def _create_parser(url):
    try:
        parser = CoreNLPParser(url=url)
        parser.raw_parse('This is a test sentence.')
    except Exception:
        parser = None
    return parser
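A hypothetical call site for the helper above, treating None as "server unreachable" (the URL is an assumption):

parser = _create_parser('http://localhost:9000')
if parser is None:
    print('CoreNLP server not reachable; falling back to plain whitespace tokenization.')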
Example #7
def get_bigram_and_deep_syntax_feature(review, speller, stop_words, ps, preprocess):
    res = ""
    productions = []

    parser = CoreNLPParser(url='http://localhost:9500')

    for sentence in re.split(r"[.!?]", review):
        try:
            tree = next(parser.raw_parse(sentence))

            # Optimize by creating Chomsky normal form
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()

        except StopIteration:
            # End of review reached
            break

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)

    count = 0
    for line in str(grammar).split("\n"):
        if count == 0:
            count += 1
            continue
        elif "'" in line:
            res += re.sub(r"[(->) `\'\"\[\d\]]", "", line) + " "

    res += bipos.get_bigrams_and_unigrams_of_sentence(
        bow.sanitize_sentence(review, speller, stop_words, ps, preprocess))

    return res
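For reference, the PCFG-induction step used above can be exercised offline by building productions from a bracketed parse string instead of a live CoreNLP parse (toy sentence, not from the original project):

from nltk import Tree
from nltk.grammar import Nonterminal, induce_pcfg

tree = Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat) (PP (IN on) (NP (DT the) (NN mat)))))")
tree.collapse_unary(collapsePOS=False)
tree.chomsky_normal_form(horzMarkov=2)
grammar = induce_pcfg(Nonterminal('S'), tree.productions())
print(grammar)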
Example #8
def start_testing(trained_model_file):
    parser = CoreNLPParser(url='http://localhost:9080')

    emotions = ['happiness', 'sadness', 'anger', 'disgust', 'surprise', 'fear']

    dirname = os.path.dirname(os.path.realpath(__file__)) + "/"

    glove_model = read_glove_vectors(dirname + "Pickle/gloveModel")

    hidden_size = 256
    num_layers = 2
    bidirectional = False
    batchnorm = False
    dropout_hidden = 0.3
    dropout_output = 0.9
    model = LSTM(300, hidden_size, num_layers, bidirectional, batchnorm,
                 dropout_hidden, dropout_output).to(device)

    with torch.no_grad():
        model.load_state_dict(torch.load(trained_model_file))
        print(model)
        model.eval()
        while True:
            test_sentence = input("Give a test sentence: ")
            sentence = list(parser.tokenize(test_sentence))
            input1, sent_length = get_input_vector(glove_model, sentence)
            class_pred = model(input1, sent_length)
            print("Sentence: " + test_sentence)
            _, pred = class_pred.max(dim=1)
            print("Prediction:\t" + emotions[pred[0]])
            print("Output Values:")
            percentages = torch.nn.functional.softmax(class_pred, dim=1) * 100
            for i in range(len(emotions)):
                print(emotions[i] + " %" +
                      str(percentages.data.tolist()[0][i]))
Example #9
def convert_eng_to_isl(input_string):

    if len(input_string.split(' ')) == 1:
        return input_string.split(' ')

    # Initializing stanford parser
    parser = CoreNLPParser()

    # Generates all possible parse trees sort by probability for the sentence
    possible_parse_tree_list = [tree for tree in parser.parse(input_string.split())]

    # Get most probable parse tree
    parse_tree = possible_parse_tree_list[0]
    # print(parse_tree)
    # output = '(ROOT
    #               (S
    #                   (PP (IN As) (NP (DT an) (NN accountant)))
    #                   (NP (PRP I))
    #                   (VP (VBP want) (S (VP (TO to) (VP (VB make) (NP (DT a) (NN payment))))))
    #                )
    #             )'

    # Convert into tree data structure
    parent_tree = ParentedTree.convert(parse_tree)    
    print("\n\nParse Tree:\n")
    print(parent_tree)   

    modified_parse_tree = modify_tree_structure(parent_tree)
    print("\n\nModified Parse Tree:\n")
    print(modified_parse_tree)

    isl_sentence = modified_parse_tree.leaves()
    return isl_sentence
Example #10
def getNERs(ws):
    from nltk.parse.corenlp import CoreNLPParser
    from textcrafts.corenlp_api import parserURL
    parser = CoreNLPParser(url=parserURL, tagtype='ner')
    ts = parser.tag(ws)
    for t in ts:
        if t[1] != 'O':
            yield t
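A quick usage sketch, assuming textcrafts' parserURL points at an NER-enabled CoreNLP server:

for token, tag in getNERs('Rami Eid is studying at Stony Brook University in NY'.split()):
    print(token, tag)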
Example #11
    def __init__(self, sentence):
        config = ApplicationConfig.get_corenlp_config()
        self._parser = CoreNLPParser(url=f"http://{config['host']}:{config['port']}")
        self._dependency = CoreNLPDependencyParser(url=f"http://{config['host']}:{config['port']}")
        sentence = sentence.replace('  ', ' ')
        sentence = sentence.replace('.', '')
        self._load(sentence)
        self.original = sentence
Example #12
def get_postagger_for_criterion(criterion):
    #ini_path = "/stanford/postagger"
    #os.environ['STANFORD_PARSER'] = ini_path
    #os.environ['STANFORD_MODELS'] = ini_path
    #os.environ['CLASSPATH'] = ini_path
    
    st = CoreNLPParser(url=os.environ['STANFORD_NLP_TOOLS'], tagtype='pos')
    postagger_list = st.tag(criterion)
    return postagger_list
Example #13
File: sense.py   Project: gumigumi4f/sv4d
    def __init__(self):
        self.synset_example = {}
        self.tokenizer = CoreNLPParser(url='http://localhost:42636')

        self.use_babelnet = use_extended_gloss
        if self.use_babelnet:
            from py4j.java_gateway import JavaGateway
            gateway = JavaGateway()
            self.sense = gateway.entry_point
Example #14
    def convert_sentence_to_ids(self, sentence: Union[str, list]):
        if not self.parser:
            self.parser = CoreNLPParser(url='http://localhost:9000',
                                        tagtype='pos')

        tags = self.convert_sentence_to_tags(sentence)
        ids = self.convert_tags_to_ids(tags)
        print(type(sentence), len(sentence), len(tags), len(ids))
        return list(ids)
Example #15
    def __init__(self, tag_id_initialized=False, tag_id=None, uncased=True):
        self.uncased = uncased
        self.tag_id_initialized = tag_id_initialized
        if tag_id_initialized:
            self.tag_to_id = tag_id
        else:
            self.tag_to_id = {"CLSSEP": 0, "UNKNOWN": 1}
        self.parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        self.basic_tokenizer = BasicTokenizer()
Example #16
def build_vocab(json: str, threshold: int, keeppunctuation: bool, host_address: str, character_level: bool = False, zh: bool = True):
    """Build a vocabulary from a caption file, dropping all words with counts < threshold.

    Args:
        json (string): Input json file. Should have a column named 'caption'.
        threshold (int): Words with counts < threshold are dropped.
        keeppunctuation (bool): Include or exclude punctuation.
        host_address (string): URL of the CoreNLP server used for tokenization when zh is True.
        character_level (bool): Tokenize captions into individual characters instead of words.
        zh (bool): Whether the captions are Chinese.

    Returns:
        vocab (Vocab): Object with the processed vocabulary
    """
    #df = pd.read_csv(csv, sep='\t')
    df = pd.read_json(json)
    counter = Counter()
    
    if zh:
        parser = CoreNLPParser(host_address)
        for i in tqdm(range(len(df)), leave=False):
            caption = str(df.loc[i]['caption'])
            # Remove all punctuations
            if not keeppunctuation:
                caption = re.sub("[{}]".format(punctuation),"",caption)
            if character_level:
                tokens = list(caption)
            else:
                tokens = list(parser.tokenize(caption))
            counter.update(tokens)
    else:
        punctuation = ',.()'
        for i in tqdm(range(len(df)), leave=False):
            caption = str(df.loc[i]['caption'])
            # Remove all punctuations
            if not keeppunctuation:
                caption = re.sub("[{}]".format(punctuation),"",caption)
            if character_level:
                tokens = list(caption)
            else:
                tokens = caption.split()
            counter.update(tokens)

    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
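A hypothetical call (the json path and server URL are assumptions, not from the original project):

vocab = build_vocab('captions.json', threshold=5, keeppunctuation=False,
                    host_address='http://localhost:9000', zh=True)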
Example #17
    def __init__(self, url='http://localhost:9000', encoding='utf8'):
        """Start the parsers to make sure they're running before calling.

        CoreNLP runs by default on port 9000, but if an external server is used
          or a different port is selected when started, the url will need to be
          explicitly passed.
        """
        self.NERT = CoreNLPNERTagger(url=url)
        self.Parser = CoreNLPParser(url=url, encoding=encoding)
        self.dep_parser = DepParser(url=url)
Example #18
    def __init__(self):
        # Annotator dependencies, see https://stanfordnlp.github.io/CoreNLP/dependencies.html
        self.additional_properties = {
            'tokenize.options':
                'ptb3Escaping=false, unicodeQuotes=true, splitHyphenated=true, normalizeParentheses=false, normalizeOtherBrackets=false',
            'annotators': 'tokenize, ssplit, pos, lemma'
        }
        self.stanford_parser = CoreNLPParser()
        # The '-xmx4G' option raises the maximum allowable JVM heap to 4GB instead of the default 512MB.
        internals.config_java(options='-xmx4G')
Example #19
class NLTK_NLP():

    def __init__(self, ip_port):
        self.dep_parser = CoreNLPDependencyParser(url=ip_port)
        self.ner_parser = CoreNLPParser(url=ip_port, tagtype='ner')
        self.parser = CoreNLPParser(url=ip_port)
        self.pos_tagger = CoreNLPParser(url=ip_port, tagtype='pos')

    def generate_dependency_tree(self, sentence):
        '''what is the name of the asteroid ?'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return dependency_tree

    def generate_dependency_graph(self, sentence):
        '''12 {'address': 12, 'word': '.', 'lemma': '.', 'ctag': '.', 'tag': '.', 'feats': '', 'head': 1, 'deps': defaultdict(<class 'list'>, {}), 'rel': 'punct'}
        7-tuple, where the values are ``word, lemma, ctag, tag, feats, head, rel``.'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return DependencyGraph(dependency_tree.to_conll(10))

    def generate_constituency_tree(self, sentence):
        '''input: one question'''
        tree_list = list(self.parser.raw_parse(sentence=sentence))
        return tree_list[0]

    def get_pos(self, sentence):
        '''What is the airspeed of an unladen swallow ?
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
        '''
        pos_list = list(self.pos_tagger.tag(sentence.split()))
        # tokens = nltk.word_tokenize(sentence)
        # wordpos = nltk.pos_tag(tokens)
        return pos_list

    def get_pos_by_tokens(self, tokens):
        '''What is the airspeed of an unladen swallow ?'''
        pos_list = list(self.pos_tagger.tag(tokens))
        return pos_list

    def get_ner(self, sentence):
        # tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
        '''april the 26th, 1882 is the birth date of which athletes ?
        [('april', 'DATE'), ('the', 'DATE'), ('26th', 'DATE'), (',', 'DATE'), ('1882', 'DATE'),
        ('is', 'O'), ('the', 'O'), ('birth', 'O'), ('date', 'O'), ('of', 'O'), ('which', 'O'),
        ('athletes', 'O'), ('?', 'O')]'''
        sequence_ner_tuple_list = self.ner_parser.tag(sentence.split())
        sequence_ner_list = []
        for i, (word, ner_tag) in enumerate(sequence_ner_tuple_list):
            sequence_ner_list.append(ner_tag)
        return sequence_ner_list

    def get_toknizer(self, sentence):
        return list(self.parser.tokenize(sentence))

    def find_phrases(self, tree, phrase_tag='NP'):
        return [subtree.leaves() for subtree in tree.subtrees(lambda t: t.label()==phrase_tag)]
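A minimal usage sketch for the class above (the host/port string is an assumption):

nlp = NLTK_NLP('http://localhost:9000')
tree = nlp.generate_constituency_tree('What is the airspeed of an unladen swallow ?')
print(nlp.find_phrases(tree, phrase_tag='NP'))
print(nlp.get_pos('What is the airspeed of an unladen swallow ?'))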
Example #20
class Lex_parser:
    def __init__(self, tag_id_initialized=False, tag_id=None, uncased=True):
        self.uncased = uncased
        self.tag_id_initialized = tag_id_initialized
        if tag_id_initialized:
            self.tag_to_id = tag_id
        else:
            self.tag_to_id = {"CLSSEP": 0, "UNKNOWN": 1}
        self.parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        self.basic_tokenizer = BasicTokenizer()

    def tokenize(self, sentence):
        return list(self.parser.tokenize(sentence))

    def convert_sentence_to_tags(self, sentence: Union[str, list]):
        if type(sentence) == str:
            if self.uncased:
                sentence = sentence.lower()

        else:
            sentence = " ".join(sentence)
            if self.uncased:
                sentence = sentence.lower()

        sentence = self.basic_tokenizer.tokenize(sentence)

        # print("sentence here,", sentence)
        sentence = list(map(lambda x: x.upper() if x == 'i' else x, sentence))
        tags = self.parser.tag(sentence)
        # print("sentence here,", sentence)
        # print("tags here", tags)
        # exit(-2)
        if not self.tag_id_initialized:
            for tag in tags:
                if tag[1] not in self.tag_to_id:
                    self.tag_to_id[tag[1]] = len(self.tag_to_id)
        return tags

    def convert_tags_to_ids(self, tags):
        res = list(map(lambda x: self.tag_to_id[x[1]], tags))
        # print("to ids ==")
        # print(len(tags), tags)
        # print(len(res), res)
        return res

    def convert_sentence_to_ids(self, sentence: Union[str, list]):
        if not self.parser:
            self.parser = CoreNLPParser(url='http://localhost:9000',
                                        tagtype='pos')

        tags = self.convert_sentence_to_tags(sentence)
        ids = self.convert_tags_to_ids(tags)
        print(type(sentence), len(sentence), len(tags), len(ids))
        return list(ids)
Example #21
def tokenize_and_write_to_tokenresult(text, dest):
    #https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK
    url = 'http://localhost:9000'
    dep_parser = CoreNLPDependencyParser(url=url)
    tokens = CoreNLPParser(url)

    res = tokens.tokenize(text)

    for token in res:
        if token == '.':
            dest.write(token.lower() + '\n')
        else:
            dest.write(token.lower() + ' ')
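A typical call site for the function above (the output file name is hypothetical):

with open('tokenresult.txt', 'w') as dest:
    tokenize_and_write_to_tokenresult('This is one sentence. This is another.', dest)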
Example #22
def stanford_nlp():
    parser = CoreNLPParser()
    text1 = "There is still a place for mercenaries working for NGOs."
    text2 = "The Rich Poor Gap Silences the Political Voice of the Poor"
    text3 = "Legislation against mercenaries"
    for text in [text1, text2, text3]:
        parse = next(parser.raw_parse(text))
        print(parse)
        has_sent = False
        for item in parse.subtrees():
            if item.label() == "S":
                has_sent = True
        print(has_sent)
Example #23
    def __init__(
        self,
        url: str = 'http://localhost:9000',
        encoding: str = 'utf-8',
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
    ):
        self._parser = CoreNLPParser(url, encoding, 'pos')

        self._start_tokens = start_tokens or []
        # We reverse the tokens here because we're going to insert them with `insert(0)` later;
        # this makes sure they show up in the right order.
        self._start_tokens.reverse()
        self._end_tokens = end_tokens or []
Example #24
def convert_text_tree(sentence):
    """ Converts a given sentence into a sentiment treebank like tree.

    :param sentence:
        String that needs to be converted.
    :return:
        String encoding tree structure.
    """
    parser = CoreNLPParser()

    # Parse sentence in nltk tree nodes
    root, = next(parser.raw_parse(sentence))

    # Recursively build text
    return get_node_text(root)
Example #25
def parse_consituency_tree(sentence_list):

    pos_parent = []
    right_sublings_list = []
    chunk_position = []
    sen = mergeWords(sentence_list)
    parser = CoreNLPParser(url="http://localhost:9000")
    parse, = parser.raw_parse(sen)
    parse.pretty_print()
    newtree = ParentedTree.convert(parse)
    leaf_values = newtree.leaves()
    for i, word in enumerate(sentence_list):
        index = find_closest_words(i, word, leaf_values)
        if index >= 0 and index < len(leaf_values):
            tree_location = newtree.leaf_treeposition(index)
            parent = newtree[tree_location[:-2]].label()
            pos_parent.append(parent)

            #####################find right_sibling###########################
            right_sibling = newtree[tree_location[:-1]].right_sibling()
            #count = calcuate_nodes((right_sibling))
            if parent == "NP" and right_sibling is not None and calcuate_nodes(
                    right_sibling) == 1:
                count = calcuate_nodes((right_sibling))
                #print(count)
                right_sublings_list.append(right_sibling.leaves()[0])
            else:
                right_sublings_list.append(" ")

            ###########################find chunk item position##########################
            height = newtree[tree_location[:-2]].height()
            # Only process the lowest-level NP (tree height == 3)
            if parent == "NP" and height == 3:
                chunk_item_list = newtree[tree_location[:-2]].leaves()
                print(newtree[tree_location[:-2]].height())
                for i, item in enumerate(chunk_item_list):
                    if item == leaf_values[index]:
                        chunk_position.append(i + 1)
                        break

            else:
                chunk_position.append(" ")

        else:
            pos_parent.append("null")
            right_sublings_list.append("null")
            chunk_position.append(" ")
    return pos_parent, right_sublings_list, chunk_position
Example #26
class Parser:
    def __init__(self):
        self.parser = CoreNLPParser()
        self.parser.session.trust_env = False  # ignore proxy settings from environment variables on the underlying requests session

    def parse(self, sentence):
        return self.parser.raw_parse(sentence)
Example #27
File: sense.py   Project: gumigumi4f/sv4d
class SynsetExample(object):
    def __init__(self):
        self.synset_example = {}
        self.tokenizer = CoreNLPParser(url='http://localhost:42636')

        self.use_babelnet = use_extended_gloss
        if self.use_babelnet:
            from py4j.java_gateway import JavaGateway
            gateway = JavaGateway()
            self.sense = gateway.entry_point

    def __getitem__(self, name):
        if name not in self.synset_example:
            self.synset_example[name] = self.get_synset_example(name)

        return self.synset_example[name]

    def get_synset_example(self, name):
        synset = wn.synset(name)
        if self.use_babelnet:
            synset_pos = synset.pos()
            if synset_pos == "s":
                synset_pos = "a"
            synset_id = 'wn:{}{}'.format(
                str(synset.offset()).zfill(8), synset_pos)
            example = self.sense.getExampleByWnSynsetId(synset_id)
            if not example:
                example = " ".join(synset.examples()).strip()
        else:
            example = " ".join(synset.examples()).strip()
        return [x.lower() for x in self.tokenizer.tokenize(example)]
Example #28
def formulate_question(question_sentence):
    """
    Formulates a Question object from question_sentence
    :param question_sentence: a string of the question sentence
    :return: a Question object representing the dependency structure of the question
    """
    # find the "question word" (see: "5 W's", "WH word") for the question
    q_parsed = next(CoreNLPParser().raw_parse(question_sentence))
    q_word = None
    # try out the normal constructions to find a question
    for subtree in q_parsed.subtrees():
        if subtree.label() in ["SBARQ", "SBAR", "SINV"]:
            for sub_subtree in subtree.subtrees():
                if sub_subtree.label()[0] == "W" and sub_subtree.label()[0:2] != "WH":
                    q_word = (sub_subtree.leaves()[0], sub_subtree.label())
                    break
            break
    # the normal constructions didn't work; just grab the first question word
    if q_word is None:
        for subtree in q_parsed.subtrees():
            if subtree.label()[0] == "W" and subtree.label()[0:2] != "WH":
                q_word = (subtree.leaves()[0], subtree.label())

    return Question(get_dependency_parse(question_sentence), q_word)
Example #29
def 提華語句法樹(bunji="我 喜歡 豬", url='http://localhost:9000'):
    try:
        句法分析器 = CoreNLPParser(url=url)
    except Warning as 錯誤:
        print('Warning=', 錯誤)

    分析結果指標 = 句法分析器.parse(simplify(bunji).split())
    該句結果字串 = next(分析結果指標)

    return 該句結果字串
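
    # NOTE: everything below this return is unreachable; it appears to have been kept as a usage demo.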

    # Print the string
    # (ROOT (IP (NP (PN 我)) (VP (VV 喜欢) (NP (NN 猪)))))
    print('該句結果字串=', 該句結果字串)

    # Pretty-print the tree diagram from the string
    # ROOT
    #      |
    #      IP
    #   ___|____
    #  |        VP
    #  |    ____|___
    #  NP  |        NP
    #  |   |        |
    #  PN  VV       NN
    #  |   |        |
    #  我   喜欢       猪
    該句結果字串.pretty_print()

    ##### Rebuild the original tokens from a tree string
    a = Tree.fromstring("(ROOT (IP (NP (PN 我)) (VP (VV 喜欢) (NP (NN 猪)))))")
    # ['我', '喜欢', '猪']
    print(a.leaves())
    # (ROOT 我 喜欢 猪)
    print(a.flatten())
Example #30
    def __init__(self, fo_lang_code):
        # Set up the Stanford CoreNLP server first, e.g.:
        # java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload
        # tokenize,ssplit,pos,lemma,parse,depparse -status_port 9000 -port 9000 -timeout 15000 -serverProperties StanfordCoreNLP-
        # chinese.properties
        self.parser = CoreNLPParser()
        self.fo_lang_code = fo_lang_code
        self.preprocessor = Preprocessor()
Example #31
#!/usr/bin/python3
# coding: utf-8
##################################################################
## CoreNLP
# server$ cd ~/datasets/Lib/CoreNLP/stanford-corenlp-full-2018-01-31
# server$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \ -preload tokenize,ssplit,pos,lemma,parse,depparse \ -status_port 9000 -port 9000 -timeout 15000
# client$ ssh -fN -L 9000:localhost:9000 [email protected] -p 23622  # forward local port 9000 (left) to port 9000 on lab_server (right)
from nltk.parse.corenlp import CoreNLPParser
stanford = CoreNLPParser()
text = 'proved to be fake, made-up'
token = list(stanford.tokenize(text)); print(token)  # ['proved', 'to', 'be', 'fake', ',', 'made-up']
text = 'proved to    be fake, made-up'  # extra internal whitespace does not affect tokenization
token = list(stanford.tokenize(text)); print(token)  # ['proved', 'to', 'be', 'fake', ',', 'made-up']

# ../jptstanford_corenlp/l1_tokenizer.py provides the same functionality, but it requires root privileges, which is a hassle