def transform(self, documents):
    """Parse each document with StanfordCoreNLP, caching results on ``doc.ext``.

    For every document, sentence parses are accumulated under
    ``doc.ext['article']`` and model-summary parses under
    ``doc.ext['models']``.  Documents already carrying both keys are
    skipped, so the transform is idempotent across runs.

    :param documents: iterable of document objects exposing ``ext`` (dict),
        and ``models`` (iterable of model-summary strings) — assumed
        project types; TODO confirm against caller.
    :returns: the same ``documents`` list, mutated in place.
    """
    logging.info("Starting documents parsing with StanfordCoreNLP...")
    for i, doc in enumerate(documents):
        processed = False
        if 'article' not in doc.ext:
            processed = True
            doc.ext['article'] = []
            # NOTE: parsing doc.text in a single stanfordParse() call would
            # also yield coref output, but was found to be far too slow —
            # hence per-sentence parsing and no 'coref' key here.
            for sentence in doc.ext['sentences']:
                doc.ext['article'].extend(
                    stanfordParse(sentence)['sentences'])
        if 'models' not in doc.ext:
            processed = True
            doc.ext['models'] = []
            for model in doc.models:
                doc.ext['models'].extend(
                    stanfordParse(model)['sentences'])
        # Lazy %-args: the message is only formatted if the level is enabled.
        if processed:
            logging.info("Processed document %i/%i", i + 1, len(documents))
        else:
            logging.info("Document %i/%i was already processed",
                         i + 1, len(documents))
    return documents
import json

import nlpio

# Smoke test for the nlpio StanfordCoreNLP wrapper: parse two sample
# sentences and pretty-print the raw JSON structure returned.
if __name__ == '__main__':
    # print(single_expression) produces identical output on Python 2 and 3;
    # the bare "print expr" statement form is a SyntaxError on Python 3.
    print(json.dumps(nlpio.stanfordParse('The world is so pretty.'),
                     indent=4))
    print(json.dumps(nlpio.stanfordParse(
        'I like trains. They are nice and clean, '
        'almost as clean as their tracks.'), indent=4))
def transform(self, documents):
    """Parse each document's full text with StanfordCoreNLP.

    Stores the complete parser output on ``doc.ext['coreNLP']`` and
    returns the same ``documents`` list, mutated in place.

    :param documents: iterable of document objects exposing ``name``,
        ``text`` and an ``ext`` dict — assumed project types; TODO confirm.
    :returns: the input ``documents`` list.
    """
    for doc in documents:
        # print(single_expression) is valid and identical on Python 2 and 3;
        # the original bare "print expr" form breaks under Python 3.
        print('Parsing: ' + doc.name)
        doc.ext['coreNLP'] = nlpio.stanfordParse(doc.text)
    return documents