# 예제 #1 (Example 1)
# 0
class RPG:
    """Builds question-answer datasets from line-oriented reddit JSON dumps.

    Each pipeline stage reads a file of one-JSON-object-per-line comments,
    writes a filtered copy into ``storage_path``, and returns the new path.
    An existing output file is reused unless ``update=True`` is passed.
    """

    def __init__(self, storage_path, corenlp_url):
        # Directory that receives every derived file.
        self.storage_path = os.path.abspath(storage_path)
        # CoreNLP constituency parser used to detect question sentences.
        self.parser = CoreNLPParser(url=corenlp_url)

    def create_question_json(self, filepath: str, max_question_word_count: int = 30, update: bool = False) -> str:
        """Keep only (question, answer) line pairs whose first line parses as a question.

        The input alternates question and answer lines; a pair is kept when
        any constituency subtree of the question carries an SBARQ/SQ
        (question-clause) label. Returns the path of the filtered file.
        """
        newfilepath = os.path.join(
            self.storage_path, 'questions_' + os.path.basename(filepath))
        if not update and os.path.exists(newfilepath):
            return newfilepath

        question_symbols = {'SBARQ', 'SQ'}

        # Context managers fix the original leak: neither handle was closed.
        with open(newfilepath, 'w') as wfile, \
                open(os.path.abspath(filepath), 'r') as rfile:
            for line in rfile:
                # Lines come in (question, answer) pairs; a trailing unpaired
                # line is ignored instead of raising StopIteration.
                answer = next(rfile, None)
                if answer is None:
                    break
                json_line = json.loads(line)
                body_text = str(json_line['body'])  # was 'property' (shadowed builtin)
                if len(body_text.split()) > max_question_word_count:
                    continue
                try:
                    # A found-flag replaces the original
                    # raise Exception('Prevent duplicate write') break hack,
                    # which also silently swallowed genuine parser errors.
                    found = False
                    for parse_tree in self.parser.parse_text(body_text):
                        for subtree in parse_tree.subtrees():
                            if subtree.label() in question_symbols:
                                found = True
                                break
                        if found:
                            break
                    if found:
                        wfile.write(line)
                        wfile.write(answer)
                except Exception as e:
                    # Best-effort: a failed parse skips the pair, but is now
                    # reported (consistent with the other stages).
                    print(e)
        return newfilepath

    def perform_ner(self, filepath: str, entity_type: str = 'number', update: bool = False) -> str:
        """Keep (question, answer) pairs whose answer contains an entity of ``entity_type``.

        Uses the recognizers-text number model on the answer body.
        Returns the path of the filtered file.
        """
        newfilepath = os.path.join(
            self.storage_path, entity_type + '_' + os.path.basename(filepath))
        if not update and os.path.exists(newfilepath):
            return newfilepath

        recognizer = NumberRecognizer(Culture.English)
        model = recognizer.get_number_model()

        with open(newfilepath, 'w') as wfile, \
                open(os.path.abspath(filepath), 'r') as rfile:
            for line in rfile:
                # Pairwise iteration; ignore a trailing unpaired line.
                answer = next(rfile, None)
                if answer is None:
                    break
                text = str(json.loads(answer)['body'])
                try:
                    for recognized in model.parse(text):
                        if recognized.type_name == entity_type:
                            wfile.write(line)
                            wfile.write(answer)
                            break
                except Exception as e:
                    print(e)
        return newfilepath

    def create_subreddit_json(self, filepath: str, subreddit: str, update: bool = False) -> str:
        """Copy only the comments belonging to ``subreddit`` (case-insensitive).

        Returns the path of the per-subreddit file.
        """
        subreddit = subreddit.lower()

        newfilepath = os.path.join(
            self.storage_path, subreddit + '_' + os.path.basename(filepath))
        if not update and os.path.exists(newfilepath):
            return newfilepath

        with open(newfilepath, 'w') as wfile, \
                open(os.path.abspath(filepath), 'r') as rfile:
            for line in rfile:
                try:
                    json_line = json.loads(line)
                    if json_line['subreddit'].lower() == subreddit:
                        wfile.write(line)
                except Exception as e:
                    print(e)
        return newfilepath

    def find_comment_pairs(self, filepath: str, min_score: int = 0, update: bool = False) -> str:
        """Write (parent, child) line pairs for comments scoring above ``min_score``.

        For each qualifying comment, its parent is the best-scoring comment
        whose ``id`` matches the child's ``parent_id`` with the 3-char link
        prefix (e.g. ``t1_``) stripped; the parent must score above 0.
        Returns the path of the pairs file.
        """
        newfilepath = os.path.join(
            self.storage_path, 'pairs_' + os.path.basename(filepath))
        if not update and os.path.exists(newfilepath):
            return newfilepath

        filepath = os.path.abspath(filepath)

        # One indexing pass replaces the original O(n^2) full rescan of the
        # file for every qualifying comment (which also leaked one file
        # handle per rescan). Strict '>' keeps first-seen on ties, matching
        # the original comparison.
        best_by_id = {}
        with open(filepath, 'r') as rfile:
            for line in rfile:
                try:
                    comment = json.loads(line)
                except Exception as e:
                    print(e)
                    continue
                current = best_by_id.get(comment['id'])
                if current is None or comment['score'] > current['score']:
                    best_by_id[comment['id']] = comment

        with open(newfilepath, 'w') as wfile, open(filepath, 'r') as rfile:
            for line in rfile:
                try:
                    comment = json.loads(line)
                except Exception:
                    # Already reported during the indexing pass.
                    continue
                if comment['score'] <= min_score:
                    continue
                parent = best_by_id.get(comment['parent_id'][3:])
                # Original only emitted parents with a strictly positive score.
                if parent is not None and parent['score'] > 0:
                    wfile.write(json.dumps(parent) + '\n')
                    wfile.write(line)
        return newfilepath

    def perform_all(self, filepath: str, subreddits: list, update: bool):
        """Run the full pipeline (subreddit → pairs → questions → NER) per subreddit,
        printing the wall-clock time of each stage."""
        for subreddit in subreddits:
            sub_time = time.time()
            sub = self.create_subreddit_json(filepath, subreddit, update=update)
            print('Subreddit comments file created in {:.2f} seconds'.format(time.time() - sub_time))

            pairs_time = time.time()
            pairs = self.find_comment_pairs(sub, update=update)
            print('Comment pairs file created in {:.2f} seconds'.format(time.time() - pairs_time))

            questions_time = time.time()
            questions = self.create_question_json(pairs, update=update)
            print('Question-answer file created in {:.2f} seconds'.format(time.time() - questions_time))

            ner_time = time.time()
            # Bug fix: 'update' was previously dropped here, so a stale NER
            # file was always reused even when update=True.
            ner = self.perform_ner(questions, update=update)
            print('Entity file created in {:.2f} seconds'.format(time.time() - ner_time))
# 예제 #2 (Example 2)
# 0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 27 13:05:59 2018

@author: raja
"""

import pandas as pd
from nltk.parse.corenlp import CoreNLPParser

# Single tab-separated column of raw sentences, no header row.
df = pd.read_csv('text_alone.csv', header=None, delimiter="\t")

# Constituency parser backed by a CoreNLP server running locally.
parser = CoreNLPParser(url='http://localhost:9010')

remaining = len(df[0])
flattened_trees = []
for text in df[0]:
    # parse_text yields one tree per sentence; only the first is kept.
    tree = next(parser.parse_text(text))
    # Collapse the pretty-printed multi-line tree onto a single line.
    flattened_trees.append(' '.join(str(tree).split()))
    remaining -= 1
    if remaining % 10 == 0:  # coarse progress indicator, counts down
        print(remaining)

pd.DataFrame(flattened_trees).to_csv('pos_tree1.csv', index=False, header=False)
# 예제 #3 (Example 3)
# 0
import os
import nltk
from nltk.parse.corenlp import CoreNLPServer
from nltk.parse.corenlp import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser

# Local CoreNLP 3.9.2 distribution; the server is launched from these jars.
STANFORD = "stanford-corenlp-full-2018-10-05"

jars = (
    os.path.join(STANFORD, "stanford-corenlp-3.9.2.jar"),
    os.path.join(STANFORD, "stanford-corenlp-3.9.2-models.jar"),
)

# Navigation instruction to parse.
text = "turn right and go up the stairs and stand at the top."
print(text)

# CoreNLPServer is a context manager: the Java server process is started on
# entry and shut down on exit, even if a parse below raises.
with CoreNLPServer(*jars):

    # Constituency parse: parse_text yields one tree per sentence.
    parser = CoreNLPParser()
    for tree in parser.parse_text(text):
        print(tree)

    # Dependency parse of the same text.
    parser = CoreNLPDependencyParser()
    for graph in parser.raw_parse(text):
        print(graph)