Example #1
    def run_nlp(self, language):
        # Make sure the server is running properly (as explained in https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK); it may need root:
        # english: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,sentiment -status_port 9000 -port 9000 -timeout 15000
        # The German model cannot do sentiment analysis, so its predictions bear no relevance; keeping the code as is just makes it easier to add some sentiment analysis of the parsed German text in the future.
        # If the service times out, increasing the timeout helps; this usually happens when a sentence is too long to be handled within the given period.
        self.__check_language(language)
        util.time_log("starting NLP...")
        annotator_dict = {"annotators": "sentiment"}
        classifier = CoreNLPParser("http://localhost:9000")

        ret_list = []

        for k_iter in range(self.k):
            prediction = []
            for review in self.test_data_text(language, k_iter):
                response_dict = classifier.api_call(review,
                                                    properties=annotator_dict,
                                                    timeout=500)
                count = 0
                sentiment = 0.0
                # CoreNLP's per-sentence sentimentValue is on a 0 (very negative)
                # to 4 (very positive) scale; average it over the whole review.
                for sentence in response_dict["sentences"]:
                    count += 1
                    sentiment += float(sentence["sentimentValue"])

                avg_sentiment = sentiment / count
                # Thresholding at >= 2 (neutral or better counts as positive) gave a lot better results.
                prediction.append(1 if avg_sentiment >= 2 else 0)
            ret_list.append(prediction)
        return ret_list
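The same per-review averaging can be sketched outside the k-fold loop; this is a minimal example assuming the English server from the comment above is listening on localhost:9000 (the helper name average_sentiment and the sample text are illustrative, not part of the original code):

from nltk.parse.corenlp import CoreNLPParser


def average_sentiment(text, url="http://localhost:9000"):
    parser = CoreNLPParser(url)
    response = parser.api_call(text,
                               properties={"annotators": "sentiment"},
                               timeout=500)
    # CoreNLP assigns each sentence a sentimentValue from 0 (very negative)
    # to 4 (very positive); average it over all sentences of the review.
    values = [float(s["sentimentValue"]) for s in response["sentences"]]
    return sum(values) / len(values) if values else 0.0


# Same >= 2 threshold as above: 1 = positive, 0 = negative.
label = 1 if average_sentiment("The movie was surprisingly good.") >= 2 else 0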
Example #2
from nltk.parse.corenlp import CoreNLPParser


def annotate(sentence, lower=True):
    # Tokenize and sentence-split via the CoreNLP server on localhost:9000.
    nlp = CoreNLPParser('http://localhost:9000')
    res = nlp.api_call(sentence, properties={'annotators': 'tokenize,ssplit'})

    words, gloss, after = [], [], []
    for t in res['sentences']:
        for token in t['tokens']:
            words.append(token['word'])
            gloss.append(token['originalText'])
            after.append(token['after'])
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
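A quick usage check for annotate (the sample sentence is illustrative; it assumes the same tokenize/ssplit server on localhost:9000 is running):

tokens = annotate("Dr. Smith didn't go to N.Y.C., he stayed home.")
print(tokens['words'])  # lower-cased token strings
print(tokens['gloss'])  # original surface forms
print(tokens['after'])  # trailing whitespace per token, handy for detokenization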
Example #3
from nltk.parse.corenlp import CoreNLPParser

# `additional_properties` is defined elsewhere in the original module; since POS
# tags and lemmas are read below, something like this is assumed:
additional_properties = {'annotators': 'tokenize,ssplit,pos,lemma'}


def tag_file(inputfile, lemma=True):
    stanford_parser = CoreNLPParser()
    with open(inputfile) as fin:
        content = []
        for line in fin:
            linepos = []
            line = line.strip()
            json_result = stanford_parser.api_call(
                line, properties=additional_properties)
            for sentence in json_result['sentences']:
                for dpos in sentence['tokens']:
                    # Keep either the lemma or the surface form, plus its POS tag.
                    word = dpos['lemma'] if lemma else dpos['word']
                    pos = dpos['pos']
                    linepos.append((word, pos))
            if linepos:
                content.append(linepos[:])
    print(content)
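A hypothetical call, assuming a CoreNLP server on the default localhost:9000 and a plain-text file with one segment per line (the file name is illustrative):

tag_file('reviews.txt')               # prints e.g. [[('the', 'DT'), ('movie', 'NN'), ...], ...]
tag_file('reviews.txt', lemma=False)  # keep surface forms instead of lemmas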
Example #4
import sys
import time

import requests
from nltk.parse.corenlp import CoreNLPParser

# Both names below are defined elsewhere in the original script; plausible
# values are assumed here so the snippet is self-contained.
end_of_document_symbol = '</doc>'  # assumed marker line that ends a document
additional_properties = {'annotators': 'tokenize,ssplit,pos'}  # assumed


def main():
    tokenizer = CoreNLPParser(url='http://localhost:42636')
    vocab = set()
    for line in open(sys.argv[1]):
        word = line.rstrip()
        vocab.add(word)

    document_buffer = ""
    token_buffer = []

    with open(sys.argv[2]) as fin, open(sys.argv[3], "w") as fout:
        start = time.time()

        for e, line in enumerate(fin):
            if line.strip() == "":
                continue
            elif line.strip().lower() != end_of_document_symbol:
                document_buffer += line.strip() + " <br> "
                # Flush the buffer early if it grows too large for one request.
                if len(document_buffer) > 90000:
                    while True:
                        try:
                            json_result = tokenizer.api_call(
                                document_buffer,
                                properties=additional_properties)
                            break
                        except requests.exceptions.HTTPError:
                            pass  # retry until the server accepts the request
                    for sentence in json_result['sentences']:
                        token_buffer += [(x["originalText"], x["pos"])
                                         for x in sentence['tokens']]
                    document_buffer = ""
            else:
                # End-of-document marker: tokenize whatever remains in the buffer.
                while True:
                    try:
                        json_result = tokenizer.api_call(
                            document_buffer, properties=additional_properties)
                        break
                    except requests.exceptions.HTTPError:
                        pass  # retry on transient HTTP errors
                for sentence in json_result['sentences']:
                    token_buffer += [(x["originalText"], x["pos"])
                                     for x in sentence['tokens']]

                document = " ".join([
                    x.lower() + "__" +
                    pos if x != "." and x != "<br>" else "<br>"
                    for x, pos in token_buffer
                    if x.lower() in vocab or x in ["<br>", "."]
                ])
                sentences = [
                    x.strip() for x in document.split("<br>") if x.strip()
                ]
                fout.write("<doc>\n" + "\n".join(sentences) + "\n</doc>\n")

                document_buffer = ""
                token_buffer = []

            # Rough ETA assuming a hard-coded total of 30,749,930 input lines.
            eta = 30749930 / (e + 1) * (time.time() - start) - (time.time() -
                                                                start)
            if (e + 1) % 500 == 0:
                sys.stdout.write("\rsent: %i/%i\tETA: %f" %
                                 (e + 1, 30749930, eta))
                sys.stdout.flush()
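The two retry loops above busy-wait on HTTPError; a small shared helper with a short back-off could replace both (the name api_call_with_retry is illustrative, not part of the original script):

import time

import requests


def api_call_with_retry(tokenizer, text, properties, delay=1.0):
    # Retry until the CoreNLP server accepts the request, sleeping briefly
    # between attempts instead of hammering it in a tight loop.
    while True:
        try:
            return tokenizer.api_call(text, properties=properties)
        except requests.exceptions.HTTPError:
            time.sleep(delay)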
Example #5
# Likely imports for this snippet (the original file's imports are not shown).
import logging

from nltk import internals
from nltk.parse.corenlp import CoreNLPParser
from whoosh.analysis import Composable, Token
from whoosh.compat import text_type


class StanTokenizer(Composable):
    def __init__(self):
        # Annotator dependencies, see https://stanfordnlp.github.io/CoreNLP/dependencies.html
        self.additional_properties = {
            'tokenize.options':
            'ptb3Escaping=false, unicodeQuotes=true, splitHyphenated=true, normalizeParentheses=false, normalizeOtherBrackets=false',
            'annotators': 'tokenize, ssplit, pos, lemma'
        }
        self.stanford_parser = CoreNLPParser()
        # '-xmx4G' raises the maximum allowable Java heap to 4GB instead of the default 512MB.
        internals.config_java(options='-xmx4G')

    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t

        else:
            pos = start_pos
            try:
                json_result = self.stanford_parser.api_call(
                    value, properties=self.additional_properties)
                for sentence in json_result['sentences']:
                    for token in sentence['tokens']:
                        if token:
                            t.text = token['word']
                            t.lemma = token['lemma']
                            t.pos = token['pos']
                            t.boost = 1.0
                            if keeporiginal:
                                t.original = token['originalText']
                            t.stopped = False
                            if positions:
                                t.pos = pos
                                pos += 1
                            if chars:
                                t.startchar = token['characterOffsetBegin']
                                t.endchar = token['characterOffsetEnd']
                            yield t
            except Exception as e:
                # Log and swallow any server/parsing error so indexing can continue.
                logging.critical(str(e))
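One possible way to plug the tokenizer into a Whoosh analyzer chain (the field name and filters are illustrative; it assumes a CoreNLP server is already running on the default port):

from whoosh.analysis import LowercaseFilter, StopFilter
from whoosh.fields import Schema, TEXT

analyzer = StanTokenizer() | LowercaseFilter() | StopFilter()
schema = Schema(body=TEXT(analyzer=analyzer, stored=True))

# Tokens can also be inspected directly; without positions=True, tok.pos
# still holds the POS tag assigned above.
for tok in StanTokenizer()(u"The quick brown foxes jumped over the fence."):
    print(tok.text, tok.lemma, tok.pos)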