def main(giga_db_loc, n_docs, pos_tag=False, parse=False):
    docs = Gigaword(giga_db_loc, limit=n_docs)
    if parse:
        mode = "justparse"
    elif pos_tag:
        mode = "pos"
    else:
        mode = "tokenize"
    nlp = sockwrap.SockWrap(mode=mode, corenlp_libdir=STANFORD_PATH,
                            configfile=CONFIG_PATH, server_port=12342)
    for doc in docs:
        # named 'parsed' so it doesn't shadow the 'parse' flag argument
        parsed = nlp.parse_doc(doc)
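# A minimal sketch of consuming parse_doc() output, assuming the JSON layout
# shown in the proc_doc_lines.py docstring below (a "sentences" list whose
# entries carry "tokens", "pos", "lemmas", and "char_offsets"); the helper
# name print_pos_tags is hypothetical and assumes mode="pos":
def print_pos_tags(nlp, doc):
    parsed = nlp.parse_doc(doc)
    for sent in parsed["sentences"]:
        for token, tag in zip(sent["tokens"], sent["pos"]):
            print token, tag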
def __init__(self, cfilt_preorder_dir, stanford_core_dir,
             reorder_mode='generic', poolsize=1):
    """
    cfilt_preorder_dir -- path to the CFILT preorder scripts and the
        reorderEnglish.ini CoreNLP config.
    stanford_core_dir -- path to the Stanford CoreNLP jars.
    reorder_mode -- 'hindi_tuned' selects codkiller-V1.0.pl; any other
        value selects the codkiller-V1.0_<mode>.pl variant.
    poolsize -- number of parallel workers.
    """
    self._cfilt_preorder_dir = cfilt_preorder_dir
    self._stanford_core_dir = stanford_core_dir
    self._poolsize = poolsize
    self._reorder_mode = reorder_mode
    self._script_name = 'codkiller-V1.0{}.pl'.format(
        '' if reorder_mode == 'hindi_tuned' else '_' + reorder_mode)
    self._pipeline = sockwrap.SockWrap(
        "justparse",
        corenlp_libdir=self._stanford_core_dir,
        configfile=self._cfilt_preorder_dir + '/reorderEnglish.ini')
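# Hypothetical instantiation of the preorderer class the __init__ above
# belongs to; the class name "Preorderer" and both directory paths are
# assumptions made for illustration only:
#
#     preorderer = Preorderer('/path/to/cfilt-preorder',
#                             '/path/to/stanford-corenlp',
#                             reorder_mode='hindi_tuned', poolsize=4)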
""" Input on stdin. Every line is one document. Output on stdout. Every line is one JSON object. USAGE proc_doc_lines.py MODE < inputfile > outputfile e.g. echo -e "this is one doc.\nHere is another." | python proc_doc_lines.py pos a bunch of crap will appear on stderr, but the only two stdout lines will be something like: {"sentences":[{"pos":["DT","VBZ","CD","NN","."],"lemmas":["this","be","one","doc","."],"tokens":["this","is","one","doc","."],"char_offsets":[[0,4],[5,7],[8,11],[12,15],[15,16]]}]} {"sentences":[{"pos":["RB","VBZ","DT","."],"lemmas":["here","be","another","."],"tokens":["Here","is","another","."],"char_offsets":[[0,4],[5,7],[8,15],[15,16]]}]} """ import sys mode = sys.argv[1] import stanford_corenlp_pywrapper.sockwrap as sw ss = sw.SockWrap(mode) # need to override corenlp_jars for line in sys.stdin: text = line.rstrip("\n").decode('utf8', 'replace') jdoc = ss.parse_doc(text, raw=True) assert "\n" not in jdoc print jdoc
def stanford_parse(coll, stories, stanford):
    """
    Runs stories pulled from the MongoDB instance through CoreNLP. Updates
    the database entry with the parsed sentences. Currently set to run the
    first 7 sentences.

    Parameters
    ----------
    coll: pymongo.collection.Collection.
        Collection within MongoDB that holds the scraped news stories.

    stories: pymongo.cursor.Cursor.
        Stories pulled from the MongoDB instance.

    stanford: String.
        Directory path for Stanford CoreNLP.
    """
    logger = logging.getLogger('stanford')
    logger.info('Setting up CoreNLP.')
    print "\nSetting up StanfordNLP. The program isn't dead. Promise."
    stanford_parser = sockwrap.SockWrap(mode='justparse',
                                        configfile='stanford_config.ini',
                                        corenlp_libdir=stanford)
    total = stories.count()
    print "Stanford setup complete. Starting parse of {} stories...".format(
        total)
    logger.info('Finished CoreNLP setup.')
    for story in stories:
        logger.info('\tProcessing story {}'.format(story['_id']))
        if story['stanford'] == 1:
            logger.info('\tStory {} already parsed.'.format(story['_id']))
        else:
            # Parse only the first 7 sentences of each story.
            content = _sentence_segmenter(story['content'])[:7]
            parsed = []
            for sent in content:
                try:
                    stanford_result = stanford_parser.parse_doc(sent)
                    parsed.append(stanford_result['sentences'][0]['parse'])
                except Exception as e:
                    print 'Error on story {}. {}'.format(story['_id'], e)
                    logger.warning('\tError on story {}. {}'.format(
                        story['_id'], e))
            coll.update({"_id": story['_id']},
                        {"$set": {'parsed_sents': parsed, 'stanford': 1}})
    print 'Done with StanfordNLP parse...\n\n'
    logger.info('Done with CoreNLP parse.')
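# A hypothetical driver for stanford_parse(), assuming a local MongoDB where
# unparsed stories carry stanford == 0; the database and collection names and
# the CoreNLP path are illustrative only:
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
coll = client['event_scrape']['stories']
stories = coll.find({"stanford": 0})
stanford_parse(coll, stories, '/opt/stanford-corenlp')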
""" Input on stdin. Every line is one document. Output on stdout. Every line is one JSON object. USAGE proc_doc_lines.py MODE < inputfile > outputfile e.g. echo -e "this is one doc.\nHere is another." | python proc_doc_lines.py pos a bunch of crap will appear on stderr, but the only two stdout lines will be something like: {"sentences":[{"pos":["DT","VBZ","CD","NN","."],"lemmas":["this","be","one","doc","."],"tokens":["this","is","one","doc","."],"char_offsets":[[0,4],[5,7],[8,11],[12,15],[15,16]]}]} {"sentences":[{"pos":["RB","VBZ","DT","."],"lemmas":["here","be","another","."],"tokens":["Here","is","another","."],"char_offsets":[[0,4],[5,7],[8,15],[15,16]]}]} """ import sys mode = sys.argv[1] import stanford_corenlp_pywrapper.sockwrap as sw ss = sw.SockWrap(mode) for line in sys.stdin: text = line.rstrip("\n").decode('utf8', 'replace') jdoc = ss.parse_doc(text, raw=True) assert "\n" not in jdoc print jdoc
def main():
    INPUT_FILENAME = "input.txt"
    OUTPUT_FILENAME = "results.txt"

    # Get running mode. 'Standard' produces events normally. 'Null Actors'
    # records the actor labels.
    parser = argparse.ArgumentParser(
        description='Run petrarch in standard mode or null-actors mode')
    parser.add_argument("event_mode",
                        help="Specify [s]tandard or [n]ull actors mode.")
    args = parser.parse_args()
    print("Running mode: " + args.event_mode)

    mode = args.event_mode.lower()
    if mode.startswith("s"):
        config_file = "PETR_config_standard.ini"
    elif mode.startswith("n"):
        config_file = "PETR_config_null_actors.ini"
    else:
        print("Must specify [s]tandard or [n]ull actors mode as an argument."
              "\nExiting.")
        sys.exit()

    # Read input: skip blank lines and comment lines starting with '#'.
    entries = []
    try:
        with open(INPUT_FILENAME, "r") as f:
            lines = f.readlines()
    except IOError:
        logging.error("Input file: " + INPUT_FILENAME +
                      " not able to be read.")
        sys.exit()
    for line in lines:
        line = line.strip()
        if line and not line.startswith("#"):
            entries.append(line)

    config_dir = 'data/config/'
    config = petrarch2.utilities._get_data(config_dir, config_file)
    config_path = config_dir + config_file
    print("Reading config: " + config_path)
    petrarch2.PETRreader.parse_Config(config)
    print("Reading dictionaries.")
    petrarch2.read_dictionaries()

    stanford_parser = sockwrap.SockWrap(
        mode='justparse',
        configfile='stanford_config.ini',
        corenlp_libdir="stanford_corenlp_pywrapper/stanford-corenlp")

    results = []
    for entry in entries:
        # Each entry is an 8-digit date, a space, then the sentence text.
        date = entry[:8]
        sent = entry[9:].decode('utf-8')
        coding_result = parse_sentence(stanford_parser, date, sent)
        results.append(coding_result)

    try:
        with open(OUTPUT_FILENAME, 'w') as f:
            f.write("SENTENCE PARSE RESULTS\n")
            f.write(str(datetime.datetime.now()) + "\n")
            f.write("# sentences: " + str(len(results)) + "\n")
            if mode.startswith('s'):
                f.write("Coding Mode: Null Actors = FALSE\n\n")
            else:
                f.write("Coding Mode: Null Actors = TRUE\n\n")
            for coding_result in results:
                if coding_result['has_events']:
                    write_has_events(f, coding_result)
                else:
                    write_non_event(f, coding_result)
    except IOError:
        logging.error("Error writing output")
        sys.exit()
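# The expected input.txt format, inferred from the slicing above (entry[:8]
# is the date, entry[9:] the sentence); the dates and sentences here are
# illustrative only:
#
#     # lines starting with '#' and blank lines are skipped
#     20140101 Protesters clashed with police in the capital.
#     20140102 The two governments signed a ceasefire agreement.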