def test_complex1():
    """Code a long sentence with nested clauses and check the coded event.

    The sentence and its bracketed parse are wrapped in the single-document
    structure passed to ``petrarch2.do_coding``; the test passes when sentence
    ``'0'`` of document ``'test123'`` carries exactly the event
    ``('TUNJUD', 'NGAEDU', '173')``.
    """
    text = ("A Tunisian court has jailed a Nigerian student for two years "
            "for helping young militants join an armed Islamic group in "
            "Lebanon, his lawyer said Wednesday.")
    # NOTE(review): the parse contains a lone 'f' right after '(NN group))';
    # it is reproduced verbatim here -- confirm whether it is intentional.
    parse = ("( (S (S (NP (DT A) (NNP Tunisian) (NN court)) "
             "(VP (AUXZ has) (VP (VBN jailed) (NP (DT a) (JJ Nigerian) (NN student)) "
             "(PP (IN for) (NP (CD two) (NNS years))) "
             "(PP (IN for) (S (VP (VBG helping) (S (NP (JJ young) (NNS militants)) "
             "(VP (VB join) (NP (NP (DT an) (JJ armed) (JJ Islamic) (NN group))f "
             "(PP (IN in) (NP (NNP Lebanon)))))))))))) (, ,) (NP (PRP$ his) (NN lawyer)) "
             "(VP (VBD said) (NP (NNP Wednesday))) (. .)))")

    tree = utilities._format_parsed_str(parse)
    sentence_entry = {u'content': text, u'parsed': tree}
    coding_input = {
        u'test123': {
            u'sents': {u'0': sentence_entry},
            u'meta': {u'date': u'20010101'},
        },
    }

    coded = petrarch2.do_coding(coding_input, None)
    print(coded)

    expected_events = [('TUNJUD', 'NGAEDU', '173')]
    assert coded['test123']['sents']['0']['events'] == expected_events
def get_phrases(self, text, parse):
    """Code one sentence with petrarch2 and extract its noun/verb phrases.

    Args:
        self: Unused by this function.
        text: The raw sentence text.
        parse: The bracketed parse of ``text``; it is passed through
            ``utilities._format_parsed_str`` before coding.

    Returns:
        A dict with the keys ``"nouns"``, ``"noun_coding"``, ``"verbs"`` and
        ``"verb_coding"``. ``"verbs"`` and ``"verb_coding"`` are ``""`` when
        the coded result has no (or an empty) ``eventtext`` entry.

    Raises:
        KeyError: If the coded result lacks the ``meta``/``verbs``/``nouns``
            entries (these lookups are intentionally not guarded).
    """
    parsed = utilities._format_parsed_str(parse)
    ddict = {
        u'test123': {
            u'sents': {u'0': {u'content': text, u'parsed': parsed}},
            u'meta': {u'date': u'20010101'},
        },
    }
    return_dict = petrarch2.do_coding(ddict, None)

    verbs_meta = return_dict['test123']['meta']['verbs']
    n = verbs_meta['nouns']
    nouns = [i[0] for i in n]
    noun_coding = [i[1] for i in n]

    # list(...) keeps the "first entry" lookup working on both Python 2 lists
    # and Python 3 dict views.  IndexError is caught alongside KeyError so an
    # empty eventtext mapping is treated the same as a missing one instead of
    # crashing.
    try:
        verbs = list(verbs_meta['eventtext'].values())[0]
    except (KeyError, IndexError):
        print("No eventtext")
        verbs = ""

    try:
        # The third element of the first eventtext key is used as the coding.
        verb_coding = list(verbs_meta['eventtext'].keys())[0][2]
    except (KeyError, IndexError) as e:
        print(e)
        verb_coding = ""

    phrase_dict = {
        "nouns": nouns,
        "noun_coding": noun_coding,
        "verbs": verbs,
        "verb_coding": verb_coding,
    }
    return phrase_dict
def parse_sentence(stanford_parser, date, text):
    """Parse ``text``, code its first sentence and return the annotated result.

    Args:
        stanford_parser: Object whose ``parse_doc(text)`` returns a mapping
            with a ``'sentences'`` list; only the first sentence's ``'parse'``
            is used.
        date: Value stored as the document date and echoed back in the result.
        text: The sentence text to parse and code.

    Returns:
        The dictionary returned by ``petrarch2.do_coding`` with two extra
        top-level keys: ``'has_events'`` (whether sentence ``'0'`` of ``'doc'``
        has an ``"events"`` entry) and ``'sentence_date'`` (``date``).
    """
    first_sentence = stanford_parser.parse_doc(text)['sentences'][0]
    tree = utilities._format_parsed_str(first_sentence['parse'])

    coding_input = {
        u'doc': {
            u'sents': {u'0': {u'content': text, u'parsed': tree}},
            u'meta': {u'date': date},
        },
    }
    return_dict = petrarch2.do_coding(coding_input)

    # Annotate the coder's own result dict in place before handing it back.
    return_dict['has_events'] = "events" in return_dict['doc']['sents']['0']
    return_dict['sentence_date'] = date

    print(str(return_dict))
    return return_dict
def test_simple2():
    """Check that "Germany arrested France" is coded as ('DEU', 'FRA', '173')."""
    text = "Germany arrested France"
    parse = "(ROOT (S (NP (NNP Germany)) (VP (VBD arrested) (NP (NNP France)))))"

    sentence_entry = {
        u'content': text,
        u'parsed': utilities._format_parsed_str(parse),
    }
    document = {
        u'sents': {u'0': sentence_entry},
        u'meta': {u'date': u'20010101'},
    }

    coded = petrarch2.do_coding({u'test123': document}, None)
    print(coded)

    events = coded['test123']['sents']['0']['events']
    assert events == [('DEU', 'FRA', '173')]
def get_phrases(doc):
    """Extract noun and verb phrases for every sentence of one document row.

    Args:
        doc: Indexable record where ``doc[0]`` is the article id, ``doc[1]`` is
            the date handed to ``date_formatter``, ``doc[2]`` is a JSON string
            with a ``'sentences'`` list (each item providing ``'sen_id'``,
            ``'tree'`` and ``'sentence'``) and ``doc[3]`` is the document id.

    Returns:
        A tuple ``(article_id, phrases_json, doc_id)`` where ``phrases_json``
        is the JSON encoding of a list holding one ``{sentence_text:
        phrase_dict}`` entry per sentence.  Each ``phrase_dict`` has the keys
        ``"nouns"``, ``"noun_coding"``, ``"verbs"`` and ``"verb_coding"``.
    """
    article_id = doc[0]
    date = date_formatter(doc[1])
    doc_id = doc[3]
    sentences = json.loads(doc[2])['sentences']

    phrases_output = []
    for sentence in sentences:
        # The sentence already comes from json.loads, so no extra
        # dumps/loads round trip is needed before reading its fields.
        sentence_id = sentence['sen_id']
        sentence_text = sentence['sentence']
        parsed = utilities._format_parsed_str(sentence['tree'])
        coding_input = {
            doc_id: {
                u'sents': {
                    sentence_id: {u'content': sentence_text, u'parsed': parsed},
                },
                u'meta': {u'date': date.encode()},
            },
        }

        # Reset all per-sentence state so that a sentence which fails to code
        # never reports the nouns or verb coding of the previous sentence.
        return_dict = {}
        nouns = []
        noun_coding = []

        try:
            return_dict = petrarch2.do_coding(coding_input)
            verbs_meta = return_dict[doc_id]['meta']['verbs']
            n = verbs_meta['nouns']
            nouns = [i[0] for i in n]
            noun_coding = [i[1] for i in n]
            verbs = list(verbs_meta['eventtext'].values())[0]
        except Exception:
            # Deliberately best-effort: any coding failure yields empty verbs.
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate.
            print("No eventtext")
            verbs = ""

        try:
            event_keys = list(
                return_dict[doc_id]['meta']['verbs']['eventtext'].keys())
            # The third element of the first eventtext key is the coding.
            verb_coding = event_keys[0][2]
        except (KeyError, IndexError) as e:
            # IndexError covers an eventtext mapping that exists but is empty.
            print(e)
            verb_coding = ""

        phrase_dict = {
            "nouns": nouns,
            "noun_coding": noun_coding,
            "verbs": verbs,
            "verb_coding": verb_coding,
        }
        phrases_output.append({sentence_text: phrase_dict})

    return (article_id, json.dumps(phrases_output), doc_id)
def read_json_(article_main):
    """Convert one stored article record into the petrarch2 input structure.

    Args:
        article_main: Mapping with an ``'output'`` entry holding the article
            as a JSON string (providing ``doc_id`` and ``sentences``) and an
            optional ``'date'`` entry.  The date's last three characters are
            dropped and the rest is read as a Unix timestamp -- presumably
            the value is epoch milliseconds; confirm against the producer.
            When the date is missing or empty, the current local time is used.

    Returns:
        ``{doc_id: {'sents': {...}, 'meta': {'date': 'YYYYMMDD'}}}`` where
        ``sents`` maps ``'1'``, ``'2'``, ... to the first seven sentences, or
        ``{}`` if anything goes wrong while building that structure.

    Raises:
        KeyError: If ``article_main`` has no ``'output'`` entry.
        ValueError: If ``'output'`` is not valid JSON.
    """
    sentence_limit = 7  # only the first 7 sentences of an article are read

    raw_date = article_main['date'] if 'date' in article_main else None
    # json.loads replaces json.load(StringIO(...), encoding=...): the
    # ``encoding`` keyword no longer exists in current Python versions and
    # UTF-8 was the default anyway.
    article = json.loads(article_main['output'])

    try:
        entry_id = str(article['doc_id'])

        if raw_date:
            article_dt = datetime.fromtimestamp(int(str(raw_date)[:-3]))
        else:
            # Previously the datetime fallback was fed to long(), which
            # raised and silently dropped every article without a date.
            article_dt = datetime.now()
        meta_content = {'date': article_dt.strftime('%Y%m%d')}

        sent_dict = {}
        # Slicing keeps exactly ``sentence_limit`` sentences; the old
        # "increment, then break on ==" loop stopped one sentence early.
        limited = article['sentences'][:sentence_limit]
        for counter, sentence in enumerate(limited, 1):
            sent_dict[str(counter)] = {
                'content': sentence['sentence'],
                'parsed': utilities._format_parsed_str(
                    sentence['parse_sentence']),
                'geo-location': json.loads(
                    get_geo_location(sentence['sentence'])),
            }

        return {entry_id: {'sents': sent_dict, 'meta': meta_content}}
    except Exception as e:
        # Best-effort by design: report the problem and return nothing.
        print(e)
        print(sys.exc_info()[0])
        return {}
def read_json(jsonString):
    """Convert an article JSON string into the petrarch2 input structure.

    Args:
        jsonString: JSON text of one article providing ``doc_id``,
            ``date_line``, ``head_line`` and ``sentences`` (each sentence
            providing ``sentence_id``, ``sentence`` and ``parse_sentence``).
            An empty ``date_line`` means "use the current local time".

    Returns:
        ``{doc_id: {'sents': {...}, 'meta': {'date': 'YYYYMMDD',
        'headline': ...}}}`` covering the first seven sentences, or ``{}``
        when the text is not valid JSON or lacks the expected fields.
    """
    sentence_limit = 7  # only the first 7 sentences of an article are read

    try:
        # json.loads replaces json.load(StringIO(...), encoding=...): the
        # ``encoding`` keyword no longer exists in current Python versions.
        article = json.loads(jsonString)
    except (TypeError, ValueError):
        print("Error while PARSING \n%s" % (jsonString,))
        # Stop here: continuing with no article used to crash a few lines
        # later instead of reporting the failure to the caller.
        return {}

    try:
        date_line = article['date_line']
        if not date_line:
            dateObject = datetime.now()
        else:
            dateObject = parser.parse(date_line)

        entry_id = str(article['doc_id'])
        meta_content = {
            'date': dateObject.strftime('%Y%m%d'),
            'headline': article['head_line'],
        }

        sent_dict = {}
        # Slicing keeps exactly ``sentence_limit`` sentences; the old
        # "increment, then break on ==" loop stopped one sentence early.
        for sentence in article['sentences'][:sentence_limit]:
            sent_id = str(sentence['sentence_id'])
            sent_dict[sent_id] = {
                'content': sentence['sentence'],
                'parsed': utilities._format_parsed_str(
                    sentence['parse_sentence']),
                'geo-location': json.loads(
                    get_geo_location(sentence['sentence'])),
            }

        return {entry_id: {'sents': sent_dict, 'meta': meta_content}}
    except Exception:
        # Best-effort by design: report the problem and return nothing.
        print('Invalid JSON Format')
        print(sys.exc_info()[0])
        return {}
def test_simple():
    """Code "Germany invaded France" and print the result and its events.

    There is no assertion on the event values; the test only fails if coding
    raises or sentence ``'0'`` of ``'test123'`` has no ``'events'`` entry.
    """
    text = "Germany invaded France"
    parse = "(ROOT (S (NP (NNP Germany)) (VP (VBD invaded) (NP (NNP France)))))"

    sentence_entry = {
        u'content': text,
        u'parsed': utilities._format_parsed_str(parse),
    }
    document = {
        u'sents': {u'0': sentence_entry},
        u'meta': {u'date': u'20010101'},
    }

    coded = petrarch2.do_coding({u'test123': document})
    print(coded)

    events = coded['test123']['sents']['0']['events']
    print(events)
# Fragment: code every sentence of every fetched row with petrarch2 and pull
# out its nouns, noun codings, verbs and verb coding.
# NOTE(review): the enclosing definition is not visible here and the layout is
# reconstructed from a whitespace-collapsed source; the nesting below (both
# try blocks inside the sentence loop) is inferred -- confirm.
# `c` is presumably a database cursor (only fetchall() is used) -- confirm.
rows= c.fetchall()
for row in rows:
    phrases_output=[]
    # row[1] is the date input, row[2] a JSON string with a 'sentences' list,
    # row[3] the document id.
    date=date_formatter(row[1])
    doc_id=row[3]
    corenlpJsonData=json.loads(row[2])
    sentences=corenlpJsonData['sentences']
    for sentence in sentences:
        sen_phrases_dict = {}
        # Round trip through JSON text; `sentence` already comes from
        # json.loads above.
        sentenceJson= json.loads(json.dumps(sentence))
        sentenceId=sentenceJson['sen_id']
        sentenceTree=sentenceJson['tree']
        sentenceData=sentenceJson['sentence']
        parsed=utilities._format_parsed_str(sentenceTree)
        # Single-document, single-sentence input for the coder.
        dict = {doc_id: {u'sents': {sentenceId: {u'content': sentenceData, u'parsed': parsed}}, u'meta': {u'date': date.encode()}}}
        try:
            return_dict = petrarch2.do_coding(dict, None)
            n = return_dict[doc_id]['meta']['verbs']['nouns']
            nouns = [i[0] for i in n]
            noun_coding = [i[1] for i in n]
            # First value of the eventtext mapping.
            verbs = return_dict[doc_id]['meta']['verbs']['eventtext'].values()[0]
        except:
            # Any failure above ends up here.
            # NOTE(review): nouns, noun_coding and return_dict are not reset
            # in the visible code, so after a failure they keep whatever an
            # earlier sentence assigned (or are undefined on the very first
            # one unless initialised outside this fragment) -- confirm.
            print "No eventtext"
            verbs = ""
        try:
            # Third element of the first eventtext key.
            verb_coding = return_dict[doc_id]['meta']['verbs']['eventtext'].keys()[0][2]
        except KeyError as e:
            # Only KeyError is handled; an empty eventtext mapping would raise
            # IndexError from [0] and propagate.
            print e
            verb_coding = ""
# Fragment: run petrarch2 on every sentence of `data_json`, collecting the
# JSON-encoded coding results and the extracted phrases per sentence text.
# NOTE(review): the enclosing definition is not visible here and the layout is
# reconstructed from a whitespace-collapsed source; `data_json`, `mongo_id`,
# `date` and `py_logger` come from outside this fragment.
sen_out_dict= {}
phrases_out_dict={}
sen_out_records= []  # one JSON string per successfully coded sentence
sent_phrases_array={}  # sentence text -> result of get_phrases
sentences=data_json['sentences']
# Counters of sentences whose handling failed / whose coding succeeded.
sen_failed=0;
sen_parsed=0;
for sentence in sentences:
    # Round trip through JSON text before reading the sentence fields.
    sen_dump=json.dumps(sentence)
    sen_json=json.loads(sen_dump)
    sen_id=sen_json['sen_id'].encode()
    sen_data=sen_json['sentence']
    sen_parse=sen_json['tree']
    text=sen_data
    parse=sen_parse
    parsed = utilities._format_parsed_str(parse)
    try:
        py_logger.debug('parsing : '+mongo_id)
        # Single-document, single-sentence input for the coder.
        dict = {mongo_id: {u'sents': {sen_id: {u'content': text, u'parsed': parsed}},u'meta': {u'date': date.encode()}}}
        return_dict = petrarch2.do_coding(dict,None)
        return_dict= json.dumps(return_dict)
        #print return_dict
        output=return_dict
        sen_out_records.append(output)
        sen_parsed= sen_parsed+1;
        # The record is appended and sen_parsed incremented before this call,
        # so if get_phrases raises, the sentence is counted as both parsed and
        # failed.
        # NOTE(review): get_phrases receives the already formatted `parsed`;
        # if it formats its third argument again, the tree is formatted
        # twice -- confirm against the get_phrases in use.
        sent_phrases_array[text]= get_phrases('', text, parsed)
    except:
        # Any failure in the block above is counted and logged, then the loop
        # moves on to the next sentence.
        sen_failed=sen_failed+1;
        py_logger.error('Parsing failed: '+mongo_id+' sen_id: '+sen_id)