# Setup header for the actor-discovery scripts (Python 2: reload(), StringIO,
# print statements below).  Builds the PETRARCH global tables once and shares
# them with a second coder so the dictionaries are not re-loaded per document.
import pprint
import re
import sys
import os
from ActorDictionaryCopy import ActorDictionary

# reload() re-imports sys so the (now commented-out) setdefaultencoding hack
# below could be applied; harmless on its own.
reload(sys)
#sys.setdefaultencoding('utf8')

pp = pprint.PrettyPrinter(indent=2)

# Stop-words stripped from NER-derived candidate actor names before counting.
discard_words_set = set(['THE', 'A', 'AN', 'OF', 'IN', 'AT', 'OUT', '', ' '])

from EventCoder import EventCoder
# First coder builds the PETR global tables from scratch; the second reuses
# them via get_PETRGlobals() (cheaper start-up).
coder = EventCoder(petrGlobal={})
another_coder = EventCoder(petrGlobal=coder.get_PETRGlobals())

# Number of top-scored candidate actors kept after tf-df scoring.
N = 10
new_actor_over_time = dict()

#input_file = open('/root/Desktop/core_nlp_out_large.txt') #open('/root/test_pet')
#input_file = open('/root/test_pet2')
from StringIO import StringIO

# Dataset directory; earlier locations kept commented for reference.
#folder_name = '/root/Desktop/files/'
#folder_name = '/root/Desktop/dataset/'
folder_name = '/Users/nikitakothari/Downloads/dataset_new/'
actor_dict = ActorDictionary()
# Alternate setup header for the actor-discovery module: same globals as the
# main header, plus wiki/pandas helpers.  Python 2 code (reload, StringIO).
import sys
import os
from wiki import wikidata
import pandas as pd
from ActorDictionaryCopy import ActorDictionary

reload(sys)
#sys.setdefaultencoding('utf8')

# NOTE(review): pprint is used here (and json/re elsewhere in this module)
# but is not imported in this fragment -- confirm it is imported upstream.
pp = pprint.PrettyPrinter(indent=2)

# Stop-words stripped from NER-derived candidate actor names before counting.
discard_words_set = set(['THE', 'A', 'AN', 'OF', 'IN', 'AT', 'OUT', '', ' '])

from EventCoder import EventCoder
# Build the PETR global tables once; the second coder reuses them.
coder = EventCoder(petrGlobal={})
another_coder = EventCoder(petrGlobal=coder.get_PETRGlobals())

# Number of top-scored candidate actors kept after tf-df scoring.
N = 10
new_actor_over_time = dict()

#input_file = open('/root/Desktop/core_nlp_out_large.txt') #open('/root/test_pet')
#input_file = open('/root/test_pet2')
from StringIO import StringIO

# Dataset directory selection is fully commented out in this variant.
#folder_name = '/root/Desktop/files/'
#folder_name = '/root/Desktop/dataset/'
#folder_name = '/Users/nikitakothari/Downloads/dataset_new/'
actor_dict = ActorDictionary()
# Spark Streaming driver: consumes CoreNLP-annotated articles from the
# 'news-article' Kafka topic and feeds each payload to DiscoverActor.
# NOTE(review): SparkConf, SparkContext, StreamingContext, KafkaUtils,
# EventCoder, DiscoverActor and map_articles are not imported/defined in this
# fragment -- confirm the pyspark/kafka imports exist in the full file.
dc = DiscoverActor()

def es_object(rdd):
    # Runs on the driver: collect() the whole micro-batch and push every
    # article text through actor discovery.  `obj` is built but never used
    # (presumably a leftover Elasticsearch payload -- see function name).
    for x in rdd.collect():
        obj = {}
        obj['text'] = x
        dc.discoverActor(x)

if __name__ == "__main__":
    conf = SparkConf().setAppName("Political Application")
    sc = SparkContext(conf=conf)
    coder = EventCoder(petrGlobal={})
    # Broadcast the PETR global tables so executors do not rebuild them.
    bMap = sc.broadcast(coder.get_PETRGlobals())
    print(bMap.__class__)
    # 5-second micro-batches.
    ssc = StreamingContext(sc, 5)
    constream = KafkaUtils.createStream(ssc=ssc, zkQuorum='localhost:2181', groupId='my_group', topics={'news-article': 1})
    # Kafka messages arrive as (key, value) pairs; keep only the payload.
    lines = constream.map(lambda x: x[1])
    #lines.pprint(1)
    events_rdd = lines.map(map_articles)
    events_rdd.foreachRDD(es_object)
    # NOTE(review): ssc.start()/ssc.awaitTermination() are not visible in this
    # fragment -- without them the streaming job never runs; confirm.
def code_articles(articleText, petrGlobals=None):
    """Encode one article with PETRARCH and return the event map as a string.

    articleText -- CoreNLP-annotated article (JSON string) passed straight to
        EventCoder.encode().
    petrGlobals -- pre-built PETR global tables; omit to have EventCoder build
        them from scratch.

    Fix: the original signature used the mutable default ``petrGlobals={}``,
    which is shared across all calls -- if EventCoder ever mutates the dict,
    state leaks between calls.  A ``None`` sentinel is backward-compatible.
    """
    if petrGlobals is None:
        petrGlobals = {}
    coder = EventCoder(petrGlobal=petrGlobals)
    events_map = coder.encode(articleText)
    return str(events_map)
class DiscoverActor:
    """Finds candidate "new" political actors in CoreNLP-annotated articles.

    For each article, collects the nouns PETRARCH could not match against the
    actor dictionary, counts their occurrences in sentences where they appear
    as PERSON named entities, then maintains tf-df style scores and an
    across-run top-N counter, both persisted as JSON under a hard-coded
    dataset directory.

    Note: everything below is a CLASS attribute (shared by all instances),
    including the two EventCoder objects built at class-definition time.
    """
    pp = pprint.PrettyPrinter(indent=2)
    # Stop-words removed from NER-derived candidate names.
    discard_words_set = set(['THE', 'A', 'AN', 'OF', 'IN', 'AT', 'OUT', '', ' '])

    # Class-body import: EventCoder also becomes a class attribute.
    from EventCoder import EventCoder
    # Build the PETR global tables once, then share them with a second coder.
    coder = EventCoder(petrGlobal={})
    another_coder = EventCoder(petrGlobal=coder.get_PETRGlobals())

    # Number of top-scored candidates kept per document batch.
    N = 10
    new_actor_over_time = dict()

    from StringIO import StringIO
    actor_dict = ActorDictionary()

    # total_new_actor_list: per-document {doc_id, new_actor: {noun: count}} records
    total_new_actor_list = []
    # word_dic: noun -> accumulated term frequency
    word_dic = dict()
    # word_dict_count: noun -> number of documents containing the noun
    word_dict_count = dict()
    className = "myclass"

    def __init__(self):
        # Debug-only constructor; no real initialization happens here.
        print self.className
        print "constr calling"

    def discoverActor(self, line):
        """Process one CoreNLP-annotated article (a JSON string).

        Side effects: re-reads and re-writes new_actor.txt and
        new_actor_td_df.txt in the hard-coded dataset directory, and prints
        copious debug output.  Returns None.
        """
        # Reload the running list of per-document records; start fresh if the
        # file is missing or corrupt (bare except swallows everything).
        try:
            with open('/Users/nikitakothari/Downloads/dataset_new/new_actor.txt') as outfile:
                self.total_new_actor_list = json.load(outfile)
        except:
            self.total_new_actor_list = []
        try:
            with open('/Users/nikitakothari/Downloads/dataset_new/new_actor_td_df.txt') as outfile:
                print "json data start"
                print json.load(outfile)
                print "json data end"
                # NOTE(review): this second json.load() runs on an exhausted
                # file handle and raises, so the except branch below always
                # resets self.new_actor_over_time; also the assignment target
                # is missing `self.` -- the sorted result binds a local that
                # is immediately discarded.  Confirm intended behavior.
                new_actor_over_time = sorted(json.load(outfile), key=lambda x : (-x[1], x[0]))
        except:
            self.new_actor_over_time = dict()
        print line
        print '==================='
        if not line.startswith('{'): #skip the null entries
            print 'Not a useful line'
            return
        #pp.pprint(another_coder.encode(line))
        new_actor_count = 0
        # Run the PETRARCH coder; its per-sentence 'meta' section lists noun
        # phrases that did not match the actor dictionary.
        dict_event = self.another_coder.encode(line)
        if dict_event is None:
            return
        new_actor_meta = dict()
        nouns = []
        for k in dict_event.keys():
            # k is the document id; remember the last one seen.
            new_actor_meta['doc_id'] = k
            if 'sents' in dict_event[k]:
                if (dict_event[k]['sents'] is not None):
                    keys = dict_event[k]['sents'].keys()
                    if keys is not None:
                        for l in keys:
                            if 'meta' in dict_event[k]['sents'][l]:
                                nouns += dict_event[k]['sents'][l]['meta']['nouns_not_matched']
        # De-duplicated candidate actor nouns for this document.
        new_actor_meta['new_actor'] = list(set(nouns))
        #print new_actor_meta
        new_actor_freq = dict()
        #new_actor_freq['doc_id'] = new_actor_meta['doc_id']
        total_count = 0
        for item in new_actor_meta['new_actor']:
            # Re-parse the article JSON for every candidate (redundant work,
            # but kept as-is).
            sentences = json.load(StringIO(line), encoding='utf-8')
            count = 0
            ner = set()
            for s in sentences['sentences']:
                # s['ner'] is a flat string like:
                #"(MONEY,$|48|million),(ORGANIZATION,United|Nations),(DATE,30|August|2016|today),(NUMBER,1.3|million),(LOCATION,Central|Africa|West|Central|Africa),(PERSON,WFP/Daouda|Guirou)"
                ner_text_list = ''
                if len(s['ner']) > 0:
                    for ner_item in s['ner'].replace('),(', ':').split(':'):
                        ner_item_list = ner_item.replace('(', '').replace(')', '').split(',')
                        if len(ner_item_list) != 2:
                            continue
                        # Only PERSON entities are treated as candidate actors.
                        if ner_item_list[0] == 'PERSON': # or ner_item_list[0] == 'MISC' or ner_item_list[0] == 'ORGANIZATION':
                            ner_text_list = ner_item_list[1]
                            # Tokens within one entity are '|'-separated.
                            ner = ner | set([x.strip().upper() for x in ner_text_list.split('|')])
                    ner = ner - self.discard_words_set
                #ner = ner | set([x.strip().upper() for x in s['ner'].replace('ORGANIZATION', '').replace('LOCATION', '').replace('PERSON', '').replace('MISC', '').replace('DATE', '').replace('(', '').replace(')', '').replace('|', ',').split(',')])
                #ner = ner | set([x.strip().upper() for x in ner_text_list.split('|')])
                #ner = ner - discard_words_set
            #print ner
            new_actor_count=0
            for s in sentences['sentences']:
                #if item in ner:
                content = s['sentence']
                if item in ner:
                    # Count raw occurrences of the candidate in the sentence.
                    count += len(re.findall(item, content.upper()))
            if self.actor_dict.contains(item):
                # Already in the actor dictionary -- not a "new" actor.
                continue
            #TO_DO: find NP from tree: findNP(NPParseTreeHashMap, item)
            new_actor_freq[item] = count
            if(count > 0):
                new_actor_count+= 1
            #print new_actor_freq
        new_actor = dict()
        print line
        new_actor['doc_id'] = new_actor_meta['doc_id']
        # try:
        #     new_actor['doc_id'] = new_actor_meta['doc_id']
        # except:
        #     new_actor['doc_id'] = "abcd"
        #
        #
        new_actor['new_actor'] = new_actor_freq
        #print new_actor
        #if (new_actor_count > 0):
        print "new actor start"
        print new_actor
        print "new actor end"
        print "new actor list start"
        print self.new_actor_over_time
        print "new actor list end"
        self.total_new_actor_list.append(new_actor)
        # Persist the updated per-document records.
        with open('/Users/nikitakothari/Downloads/dataset_new/new_actor.txt', 'w') as outfile:
            json.dump(self.total_new_actor_list, outfile)
        # --- tf-df scoring over every record seen so far ---
        total_document = 0.0
        for item in self.total_new_actor_list:
            # Each record looks like:
            #{"new_actor": {"DHUBULIA": 2, "PRIMARY": 11, "NADIA\u00c2": 1}, "doc_id": "india_telegraph_bengal20160922.0001"}
            total_count = 0.0
            if 'new_actor' in item and 'doc_id' in item:
                total_document += 1
                for k in item['new_actor'].keys():
                    total_count += item['new_actor'][k]
                for k in item['new_actor'].keys():
                    # Term frequency of noun k within this document.
                    tf = 1.00 * (item['new_actor'][k]/total_count)
                    if k not in self.word_dic:
                        self.word_dic[k] = tf
                        self.word_dict_count[k] = 1
                    else:
                        self.word_dic[k] += tf
                        self.word_dict_count[k] += 1
        # Weight each noun by the fraction of documents containing it.
        for k in self.word_dic.keys():
            self.word_dic[k] = self.word_dic[k] * (self.word_dict_count[k]/total_document)
        # NOTE(review): bare N resolves to a module-level N, not the class
        # attribute -- confirm the enclosing module defines N.
        word_dic_sorted = sorted(self.word_dic.items(), key=lambda x : (-x[1], x[0]))[:N]
        #with open('/root/Desktop/new_actor_td_df.txt', 'w') as outfile:
        #    json.dump(word_dic_sorted, outfile)
        # Bump the across-run counter for each of this batch's top-N actors.
        for actor_item in word_dic_sorted:
            actor_noun = actor_item[0]
            if actor_noun in self.new_actor_over_time:
                self.new_actor_over_time[actor_noun] += 1
            else:
                self.new_actor_over_time[actor_noun] = 1
        # Persist the counter sorted by descending count, then name.
        with open('/Users/nikitakothari/Downloads/dataset_new/new_actor_td_df.txt', 'w') as outfile:
            json.dump(sorted(self.new_actor_over_time.items(), key=lambda x : (-x[1], x[0])), outfile)
def code_articles_(articleText, petrGlobals=None):
    """Generate CAMEO events for one article and return the map as a string.

    articleText -- CoreNLP-annotated article (JSON string) passed straight to
        EventCoder.gen_cameo_event().
    petrGlobals -- pre-built PETR global tables; omit to have EventCoder build
        them from scratch.

    Fix: the original signature used the mutable default ``petrGlobals={}``,
    which is shared across all calls -- if EventCoder ever mutates the dict,
    state leaks between calls.  A ``None`` sentinel is backward-compatible.
    """
    if petrGlobals is None:
        petrGlobals = {}
    coder = EventCoder(petrGlobal=petrGlobals)
    events_map = coder.gen_cameo_event(articleText)
    return str(events_map)
def code_articles(articleText, petrGlobals={}): coder = EventCoder(petrGlobal=petrGlobals) events_map = coder.gen_cameo_event(map_articles(articleText)) return str(events_map) def code_articles_(articleText, petrGlobals={}): coder = EventCoder(petrGlobal=petrGlobals) events_map = coder.gen_cameo_event(articleText) return str(events_map) if __name__ == "__main__": coder = EventCoder(petrGlobal={}) #article = '{ "type" : "story", "doc_id" : "nytasiapacific20160622.0002", "head_line" : "Lightning Ridge Journal: An Amateur Undertaking in Australian Mining Town With No Funeral Home", "date_line" : "Tue, 21 Jun 2016 03:52:15 GMT", "sentences" : [ { "sentence_id" : 1, "sentence" : "A Tunisian court has jailed a Nigerian student for two years for helping young militants join an armed Islamic group in Lebanon, his lawyer said Wednesday.", "parse_sentence" : "(ROOT (S (S (NP (DT A) (NNP Tunisian) (NN court)) (VP (VBZ has) (VP (VBN jailed) (NP (DT a) (NNP Nigerian) (NN student)) (PP (IN for) (NP (NP (CD two) (NNS years)) (PP (IN for) (S (VP (VBG helping) (S (NP (JJ young) (NNS militants)) (VP (VB join) (NP (DT an) (JJ armed) (JJ Islamic) (NN group)) (PP (IN in) (NP (NNP Lebanon))))))))))))) (, ,) (NP (PRP$ his) (NN lawyer)) (VP (VBD said) (NP (NNP Wednesday))) (. .)))" } ], "corref" : "" }' article = """{ "type":"story", "doc_id":"hindu_cities20160829.0009", "head_line":"Velankanni festival begins", "date_line" : "Tue, 21 Jun 2016 03:52:15 GMT", "sentences":[ { "sentence_id":1, "sentence":"NAGAPATTINAM :TAMILNADU: 29/08/2016 : Our Lady of Health Velankanni Basilica Annual Flag hoisting, in progress, as thousands of devotees witnessing, in Nagapattinam district.Photo: B.Velankanni Raj The 11-day annual feast of Shrine Basilica of Our Lady of Health, popularly called Annai Velankanni Matha, began in Velankanni on Monday evening with a huge procession and hoisting of the holy flag by Most Rev. M. 
Devadass Ambrose, Bishop of Thanjavur.", "parse_sentence":"(ROOT (NP (NP (NNP NAGAPATTINAM)) (: :) (NP (NP (NNP TAMILNADU)) (: :) (NP (NP (CD 29/08/2016)) (: :) (NP (NP (NP (PRP$ Our) (NN Lady)) (PP (IN of) (NP (NP (NNP Health) (NNP Velankanni) (NN Basilica) (JJ Annual) (NN Flag)) (VP (VBG hoisting) (, ,) (PP (IN in) (NP (NN progress))) (, ,) (SBAR (IN as) (S (NP (NP (NP (NP (NNS thousands)) (PP (IN of) (NP (NNS devotees)))) (VP (VBG witnessing) (, ,) (PP (IN in) (NP (NNP Nagapattinam) (NNP district.Photo))))) (: :) (NP (NP (NNP B.Velankanni) (NNP Raj) (NNP The) (JJ 11-day) (JJ annual) (NN feast)) (PP (IN of) (NP (NP (NNP Shrine) (NN Basilica)) (PP (IN of) (NP (NP (PRP$ Our) (NN Lady)) (PP (IN of) (NP (NNP Health))))))) (, ,) (VP (ADVP (RB popularly)) (VBN called) (S (NP (NNP Annai) (NNP Velankanni) (NNP Matha)))) (, ,))) (VP (VBD began) (PP (IN in) (NP (NNP Velankanni))) (PP (IN on) (NP (NP (NNP Monday) (NN evening)) (PP (IN with) (NP (NP (DT a) (JJ huge) (NN procession)) (CC and) (NP (NP (NN hoisting)) (PP (IN of) (NP (DT the) (JJ holy) (NN flag))) (PP (IN by) (NP (JJS Most) (NNP Rev.) (NNP M.) (NNP Devadass) (NNP Ambrose))))))))))))))) (, ,) (NP (NP (NNP Bishop)) (PP (IN of) (NP (NNP Thanjavur))))))) (. 
.)))", "token":"NAGAPATTINAM,:,TAMILNADU,:,29/08/2016,:,Our,Lady,of,Health,Velankanni,Basilica,Annual,Flag,hoisting,,,in,progress,,,as,thousands,of,devotees,witnessing,,,in,Nagapattinam,district.Photo,:,B.Velankanni,Raj,The,11-day,annual,feast,of,Shrine,Basilica,of,Our,Lady,of,Health,,,popularly,called,Annai,Velankanni,Matha,,,began,in,Velankanni,on,Monday,evening,with,a,huge,procession,and,hoisting,of,the,holy,flag,by,Most,Rev.,M.,Devadass,Ambrose,,,Bishop,of,Thanjavur,.", "lemma":"NAGAPATTINAM,:,TAMILNADU,:,29/08/2016,:,we,lady,of,Health,Velankanni,basilica,annual,flag,hoist,,,in,progress,,,as,thousand,of,devotee,witness,,,in,Nagapattinam,district.Photo,:,B.Velankanni,Raj,The,11-day,annual,feast,of,Shrine,basilica,of,we,lady,of,Health,,,popularly,call,Annai,Velankanni,Matha,,,begin,in,Velankanni,on,Monday,evening,with,a,huge,procession,and,hoisting,of,the,holy,flag,by,most,Rev.,M.,Devadass,Ambrose,,,Bishop,of,Thanjavur,.",
from EventCoder import EventCoder coder = EventCoder(petrGlobal={}) another_coder = EventCoder(petrGlobal=coder.get_PETRGlobals()) input_file = open('core_nlp_out.txt') for line in input_file: print line print '===================' print another_coder.encode(line) # from dateutil import parser # from datetime import datetime # # dateObject = parser.parse("") # # article_date = datetime.strftime(dateObject, '%Y%m%d') # # # print article_date