Example #1
def main(giga_db_loc, n_docs, pos_tag=False, parse=False):
    # Gigaword, sockwrap, STANFORD_PATH, and CONFIG_PATH are defined elsewhere in the source module.
    docs = Gigaword(giga_db_loc, limit=n_docs)
    if parse:
        mode = "justparse"
    elif pos_tag:
        mode = "pos"
    else:
        mode = "tokenize"
    nlp = sockwrap.SockWrap(mode=mode,
                            corenlp_libdir=STANFORD_PATH,
                            configfile=CONFIG_PATH,
                            server_port=12342)
    for doc in docs:
        parsed = nlp.parse_doc(doc)  # result is not used further in this snippet
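parse_doc returns a dict. As a hedged illustration of how it might be consumed, assuming a SockWrap instance like nlp above run in "pos" mode (key names taken from the JSON printed in Example #3; in "justparse" mode each sentence additionally carries a "parse" string, as Example #4 uses):

result = nlp.parse_doc("This is one doc.")
for sentence in result["sentences"]:
    print(sentence["tokens"])   # e.g. ["this", "is", "one", "doc", "."]
    print(sentence["pos"])      # e.g. ["DT", "VBZ", "CD", "NN", "."]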
Example #2
    def __init__(self,
                 cfilt_preorder_dir,
                 stanford_core_dir,
                 reorder_mode='generic',
                 poolsize=1):
        """
        """
        self._cfilt_preorder_dir = cfilt_preorder_dir
        self._stanford_core_dir = stanford_core_dir
        self._poolsize = poolsize
        self._reorder_mode = reorder_mode

        self._script_name = 'codkiller-V1.0{}.pl'.format(
            '' if reorder_mode == 'hindi_tuned' else '_' + reorder_mode)

        self._pipeline = sockwrap.SockWrap(
            "justparse",
            corenlp_libdir=self._stanford_core_dir,
            configfile=self._cfilt_preorder_dir + '/reorderEnglish.ini')
Example #3
"""
Input on stdin.  Every line is one document.
Output on stdout.  Every line is one JSON object.

USAGE
proc_doc_lines.py MODE < inputfile > outputfile

e.g.
echo -e "this is one doc.\nHere is another." | python proc_doc_lines.py pos

Diagnostic output will appear on stderr, but the only two stdout lines will be something like:

{"sentences":[{"pos":["DT","VBZ","CD","NN","."],"lemmas":["this","be","one","doc","."],"tokens":["this","is","one","doc","."],"char_offsets":[[0,4],[5,7],[8,11],[12,15],[15,16]]}]}
{"sentences":[{"pos":["RB","VBZ","DT","."],"lemmas":["here","be","another","."],"tokens":["Here","is","another","."],"char_offsets":[[0,4],[5,7],[8,15],[15,16]]}]}
"""

import sys
mode = sys.argv[1]

import stanford_corenlp_pywrapper.sockwrap as sw
ss = sw.SockWrap(mode)  # need to override corenlp_jars

for line in sys.stdin:
    text = line.rstrip("\n").decode('utf8', 'replace')
    jdoc = ss.parse_doc(text, raw=True)
    assert "\n" not in jdoc
    print jdoc
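The script above is Python 2 (print statement, explicit decode of the byte strings read from stdin). A rough Python 3 equivalent, assuming the same SockWrap constructor and parse_doc(text, raw=True) behaviour, might look like:

import sys
import stanford_corenlp_pywrapper.sockwrap as sw

mode = sys.argv[1]
ss = sw.SockWrap(mode)                   # may still need to point at the CoreNLP jars

for line in sys.stdin:                   # sys.stdin already yields str in Python 3
    text = line.rstrip("\n")
    jdoc = ss.parse_doc(text, raw=True)  # raw=True yields the JSON string, per the docstring above
    assert "\n" not in jdoc
    print(jdoc)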
Example #4
def stanford_parse(coll, stories, stanford):
    """
    Runs stories pulled from the MongoDB instance through CoreNLP. Updates
    the database entry with the parsed sentences. Currently set to parse at
    most the first seven sentences of each story.

    Parameters
    ----------

    coll: pymongo.collection.Collection.
            Collection within MongoDB that holds the scraped news stories.

    stories: pymongo.cursor.Cursor.
                Stories pulled from the MongoDB instance.

    stanford: String.
                Directory path for Stanford CoreNLP.
    """
    logger = logging.getLogger('stanford')

    logger.info('Setting up CoreNLP.')
    print "\nSetting up StanfordNLP. The program isn't dead. Promise."
    stanford_parser = sockwrap.SockWrap(mode='justparse',
                                        configfile='stanford_config.ini',
                                        corenlp_libdir=stanford)

    total = stories.count()
    print "Stanford setup complete. Starting parse of {} stories...".format(
        total)
    logger.info('Finished CoreNLP setup.')

    for story in stories:
        #print 'Processing story {}. {}'.format(story['_id'],
        #                                       datetime.datetime.now())
        logger.info('\tProcessing story {}'.format(story['_id']))

        if story['stanford'] == 1:
            #print '\tStory {} already parsed.'.format(story['_id'])
            logger.info('\tStory {} already parsed.'.format(story['_id']))
            pass
        else:
            content = _sentence_segmenter(story['content'])[:7]

            parsed = []
            for sent in content:
                try:
                    stanford_result = stanford_parser.parse_doc(sent)
                    parsed.append(stanford_result['sentences'][0]['parse'])

                except Exception as e:
                    print 'Error on story {}. ¯\_(ツ)_/¯. {}'.format(
                        story['_id'], e)
                    logger.warning('\tError on story {}. {}'.format(
                        story['_id'], e))

            coll.update({"_id": story['_id']},
                        {"$set": {
                            'parsed_sents': parsed,
                            'stanford': 1
                        }})

    print 'Done with StanfordNLP parse...\n\n'
    logger.info('Done with CoreNLP parse.')
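A hedged example of wiring this function up with pymongo; the connection details, database and collection names, the query field value, and the CoreNLP path are all placeholders, a stanford_config.ini is assumed to be in the working directory, and stories.count() / coll.update() above presuppose an older pymongo where those Cursor and Collection methods still exist:

from pymongo import MongoClient

client = MongoClient("localhost", 27017)
coll = client["scraper_db"]["stories"]      # placeholder database/collection names
stories = coll.find({"stanford": 0})        # assumes 0 marks "not yet parsed"
stanford_parse(coll, stories, "/opt/stanford-corenlp")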
Example #5
def main():

    INPUT_FILENAME = "input.txt"
    OUTPUT_FILENAME = "results.txt"

    # Get running mode. 'Standard' produces events normally. 'Null Actors' records the actor labels.
    parser = argparse.ArgumentParser(
        description='Run petrarch in standard mode or null-actors mode')
    parser.add_argument("event_mode",
                        help="Specify [s]tandard or [n]ull actors mode.")
    args = parser.parse_args()
    print("Running mode: " + args.event_mode)

    config_file = "not_specified"
    mode = args.event_mode.lower()
    if mode.startswith("s"):
        config_file = "PETR_config_standard.ini"
    elif mode.startswith("n"):
        config_file = "PETR_config_null_actors.ini"
    else:
        print(
            "Must specify [s]tandard or [n]ull actors mode as an argument.\nExiting."
        )
        sys.exit()

    # Read input
    entries = []
    try:
        with open(INPUT_FILENAME, "r") as f:
            lines = f.readlines()

    except IOError as e:
        logging.error("Input file " + INPUT_FILENAME +
                      " could not be read: " + str(e))
        sys.exit()

    for line in lines:
        line = line.strip()
        if (len(line) > 0) and not line.startswith("#"):
            entries.append(line)

    config_dir = 'data/config/'
    config = petrarch2.utilities._get_data(config_dir, config_file)
    config_path = config_dir + config_file
    print("Reading config: " + config_path)

    petrarch2.PETRreader.parse_Config(config)
    print("reading dicts")
    petrarch2.read_dictionaries()

    stanford_parser = sockwrap.SockWrap(
        mode='justparse',
        configfile='stanford_config.ini',
        corenlp_libdir="stanford_corenlp_pywrapper/stanford-corenlp")

    # test = stanford_parser.parse_doc("hello world. how are you?")
    # print(test)
    #test_simple()
    results = []
    for entry in entries:
        date = entry[:8]
        sent = entry[9:].decode('utf-8')  # .decode already yields unicode
        coding_result = parse_sentence(stanford_parser, date, sent)
        results.append(coding_result)

    try:
        with open(OUTPUT_FILENAME, 'w') as f:

            f.write("SENTENCE PARSE RESULTS\n")
            f.write(str(datetime.datetime.now()) + "\n")
            f.write("# sentences: " + str(len(results)) + "\n")

            if mode.startswith('s'):
                f.write("Coding Mode:  Null Actors = FALSE\n\n")
            else:
                f.write("Coding Mode:  Null Actors = TRUE\n\n")

            for coding_result in results:
                if coding_result['has_events']:
                    write_has_events(f, coding_result)
                else:
                    write_non_event(f, coding_result)

    except Exception as e:
        logging.error("Error writing output: " + str(e))
        sys.exit()
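For reference, the slicing above (entry[:8] for the date, entry[9:] for the text) implies that each non-comment line of input.txt is an eight-character date, a separator character, and the sentence. Two made-up lines in that shape (the YYYYMMDD convention is an assumption, not taken from the source):

20150101 Government forces shelled the rebel-held town on Tuesday.
20150102 Rebel leaders agreed to meet government negotiators in Geneva.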