Example #1
import pprint
import re
import sys
import os

from ActorDictionaryCopy import ActorDictionary
reload(sys)
#sys.setdefaultencoding('utf8')

pp = pprint.PrettyPrinter(indent=2)

discard_words_set = set(['THE', 'A', 'AN', 'OF', 'IN', 'AT', 'OUT', '', ' '])

from EventCoder import EventCoder

coder = EventCoder(petrGlobal={})

another_coder = EventCoder(petrGlobal=coder.get_PETRGlobals())
N = 10
new_actor_over_time = dict()

#input_file = open('/root/Desktop/core_nlp_out_large.txt') #open('/root/test_pet')
#input_file = open('/root/test_pet2')

from StringIO import StringIO

#folder_name = '/root/Desktop/files/'
#folder_name = '/root/Desktop/dataset/'
folder_name = '/Users/nikitakothari/Downloads/dataset_new/'

actor_dict = ActorDictionary()
Example #2
import pprint
import sys
import os
from wiki import wikidata
import pandas as pd

from ActorDictionaryCopy import ActorDictionary
reload(sys)
#sys.setdefaultencoding('utf8')

pp = pprint.PrettyPrinter(indent=2)

discard_words_set = set(['THE', 'A', 'AN', 'OF', 'IN', 'AT', 'OUT', '', ' '])

from EventCoder import EventCoder

coder = EventCoder(petrGlobal={})

another_coder = EventCoder(petrGlobal=coder.get_PETRGlobals())
N = 10
new_actor_over_time = dict()

#input_file = open('/root/Desktop/core_nlp_out_large.txt') #open('/root/test_pet')
#input_file = open('/root/test_pet2')

from StringIO import StringIO

#folder_name = '/root/Desktop/files/'
#folder_name = '/root/Desktop/dataset/'
#folder_name = '/Users/nikitakothari/Downloads/dataset_new/'

actor_dict = ActorDictionary()
Example #3
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

from EventCoder import EventCoder
# Module path assumed; DiscoverActor is the class defined in Example #5.
from DiscoverActor import DiscoverActor

dc = DiscoverActor()


def es_object(rdd):
    # Collect the coded articles to the driver and run actor discovery on each.
    for x in rdd.collect():
        obj = {}
        obj['text'] = x  # assembled for a downstream sink; only discovery runs here
        dc.discoverActor(x)
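
# map_articles is referenced below but never defined in these snippets. A
# hypothetical sketch, assuming the Kafka payload is the CoreNLP-annotated
# JSON string that EventCoder.encode expects, and that bMap (broadcast in the
# driver below) carries the PETR globals:
def map_articles(article_json):
    coder = EventCoder(petrGlobal=bMap.value)  # bMap: broadcast PETR globals
    return coder.encode(article_json)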


if __name__ == "__main__":

    conf = SparkConf().setAppName("Political Application")
    sc = SparkContext(conf=conf)

    coder = EventCoder(petrGlobal={})

    # Broadcast the PETR globals so executors can rebuild coders cheaply.
    bMap = sc.broadcast(coder.get_PETRGlobals())
    print(bMap.__class__)

    ssc = StreamingContext(sc, 5)
    constream = KafkaUtils.createStream(ssc=ssc,
                                        zkQuorum='localhost:2181',
                                        groupId='my_group',
                                        topics={'news-article': 1})

    lines = constream.map(lambda x: x[1])
    #lines.pprint(1)
    events_rdd = lines.map(map_articles)
    events_rdd.foreachRDD(es_object)

    # Start the streaming job; without these calls nothing is processed.
    ssc.start()
    ssc.awaitTermination()
Example #4
def code_articles(articleText, petrGlobals={}):
    # Rebuild a coder from previously extracted PETR globals (cheap compared
    # to reloading the dictionaries) and code the article into events.
    coder = EventCoder(petrGlobal=petrGlobals)
    events_map = coder.encode(articleText)
    return str(events_map)
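
# Usage sketch following Example #3's Spark pattern; sc (a SparkContext) and
# lines (a DStream of article strings) are assumed to be in scope, and the
# names are illustrative:
#
# coder = EventCoder(petrGlobal={})             # pay the dictionary load once
# bMap = sc.broadcast(coder.get_PETRGlobals())  # broadcast only the globals
# coded = lines.map(lambda a: code_articles(a, bMap.value))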
Example #5
import json
import pprint
import re

from ActorDictionaryCopy import ActorDictionary
from EventCoder import EventCoder


class DiscoverActor:

    pp = pprint.PrettyPrinter(indent=2)
    discard_words_set = set(['THE', 'A', 'AN', 'OF', 'IN', 'AT', 'OUT', '', ' '])
    coder = EventCoder(petrGlobal={})
    another_coder = EventCoder(petrGlobal=coder.get_PETRGlobals())
    N = 10
    new_actor_over_time = dict()
    actor_dict = ActorDictionary()
    total_new_actor_list = []
    word_dic = dict()
    word_dict_count = dict()

    className = "myclass"

    def __init__(self):
        print self.className
        print "constr calling"

    def discoverActor(self, line):

        try:
            with open('/Users/nikitakothari/Downloads/dataset_new/new_actor.txt') as outfile:
                self.total_new_actor_list = json.load(outfile)
        except Exception:
            self.total_new_actor_list = []

        try:
            with open('/Users/nikitakothari/Downloads/dataset_new/new_actor_td_df.txt') as outfile:
                # json.load consumes the stream, so read it once and reuse the
                # parsed value instead of calling json.load twice.
                td_df = json.load(outfile)
                print "json data start"
                print td_df
                print "json data end"
                # The file holds (term, count) pairs; the counting code below
                # expects a dict keyed by term.
                self.new_actor_over_time = dict(td_df)
        except Exception:
            self.new_actor_over_time = dict()


        print line
        print '==================='

        if not line.startswith('{'): #skip the null entries
            print 'Not a useful line'
            return
        #pp.pprint(another_coder.encode(line))
        new_actor_count = 0
        dict_event = self.another_coder.encode(line)
        if dict_event is None:
            return

        new_actor_meta = dict()
        nouns = []


        for k in dict_event.keys():
            new_actor_meta['doc_id'] = k
            if 'sents' in dict_event[k]:
                if (dict_event[k]['sents'] is not None):
                    keys = dict_event[k]['sents'].keys()

                    if keys is not None:
                        for l in keys:
                            if 'meta' in dict_event[k]['sents'][l]:
                                nouns += dict_event[k]['sents'][l]['meta']['nouns_not_matched']

        new_actor_meta['new_actor'] = list(set(nouns))

        new_actor_freq = dict()

        total_count = 0
        # Parse the article JSON once, outside the per-actor loop.
        sentences = json.loads(line)
        for item in new_actor_meta['new_actor']:
            count = 0
            ner = set()
            for s in sentences['sentences']:
                #"(MONEY,$|48|million),(ORGANIZATION,United|Nations),(DATE,30|August|2016|today),(NUMBER,1.3|million),(LOCATION,Central|Africa|West|Central|Africa),(PERSON,WFP/Daouda|Guirou)"

                ner_text_list = ''

                if len(s['ner']) > 0:
                    for ner_item in s['ner'].replace('),(', ':').split(':'):
                        ner_item_list = ner_item.replace('(', '').replace(')', '').split(',')

                        if len(ner_item_list) != 2:
                            continue


                        if ner_item_list[0] == 'PERSON': # or ner_item_list[0] == 'MISC' or ner_item_list[0] == 'ORGANIZATION':
                            ner_text_list = ner_item_list[1]
                            ner = ner | set([x.strip().upper() for x in ner_text_list.split('|')])
                            ner = ner - self.discard_words_set

            new_actor_count = 0
            for s in sentences['sentences']:
                content = s['sentence']
                if item in ner:
                    # Escape the actor string: it may contain regex metacharacters.
                    count += len(re.findall(re.escape(item), content.upper()))

                    if self.actor_dict.contains(item):
                        continue

                    #TO_DO: find NP from tree: findNP(NPParseTreeHashMap, item)
                    new_actor_freq[item] = count
                    if count > 0:
                        new_actor_count += 1


        new_actor = dict()
        print line
        new_actor['doc_id'] = new_actor_meta['doc_id']
        # The tf-df pass below reads item['new_actor'], so every appended
        # entry needs this key alongside doc_id.
        new_actor['new_actor'] = new_actor_freq

        print "new actor start"
        print new_actor
        print "new actor end"

        print "new actor list start"
        print self.new_actor_over_time
        print "new actor list end"

        self.total_new_actor_list.append(new_actor)


        with open('/Users/nikitakothari/Downloads/dataset_new/new_actor.txt', 'w') as outfile:
            json.dump(self.total_new_actor_list, outfile)


        total_document = 0.0

        for item in self.total_new_actor_list:
            # e.g. {"new_actor": {"DHUBULIA": 2, "PRIMARY": 11}, "doc_id": "india_telegraph_bengal20160922.0001"}
            if 'new_actor' not in item or 'doc_id' not in item:
                continue
            total_document += 1

            total_count = 0.0
            for k in item['new_actor'].keys():
                total_count += item['new_actor'][k]

            if total_count == 0:
                continue  # no mentions in this document; avoid a zero division

            for k in item['new_actor'].keys():
                tf = 1.00 * (item['new_actor'][k] / total_count)
                if k not in self.word_dic:
                    self.word_dic[k] = tf
                    self.word_dict_count[k] = 1
                else:
                    self.word_dic[k] += tf
                    self.word_dict_count[k] += 1

        # Weight each accumulated term frequency by its document frequency.
        if total_document > 0:
            for k in self.word_dic.keys():
                self.word_dic[k] = self.word_dic[k] * (self.word_dict_count[k] / total_document)

        word_dic_sorted = sorted(self.word_dic.items(), key=lambda x: (-x[1], x[0]))[:self.N]

        #with open('/root/Desktop/new_actor_td_df.txt', 'w') as outfile:
        #    json.dump(word_dic_sorted, outfile)

        for actor_item in word_dic_sorted:
            actor_noun = actor_item[0]
            if actor_noun in self.new_actor_over_time:
                self.new_actor_over_time[actor_noun] += 1
            else:
                self.new_actor_over_time[actor_noun] = 1

        with open('/Users/nikitakothari/Downloads/dataset_new/new_actor_td_df.txt', 'w') as outfile:
            json.dump(sorted(self.new_actor_over_time.items(), key=lambda x : (-x[1], x[0])), outfile)
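
# A minimal driver for DiscoverActor, following the call pattern of Example #3
# and the one-article-JSON-per-line input used in Example #8:
if __name__ == "__main__":
    dc = DiscoverActor()
    with open('core_nlp_out.txt') as input_file:
        for line in input_file:
            dc.discoverActor(line)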
Example #6
def code_articles_(articleText, petrGlobals={}):
    # Same globals hand-off as code_articles in Example #4, but emits events
    # via gen_cameo_event rather than encode.
    coder = EventCoder(petrGlobal=petrGlobals)
    events_map = coder.gen_cameo_event(articleText)
    return str(events_map)
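
# Direct-call sketch: `article` stands for an annotated-article JSON string
# like the one built in Example #7:
#
# coder = EventCoder(petrGlobal={})
# print code_articles_(article, coder.get_PETRGlobals())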
Example #7
def code_articles(articleText, petrGlobals={}):
    coder = EventCoder(petrGlobal=petrGlobals)
    # map_articles (not defined in this snippet) pre-processes the raw
    # article before CAMEO coding.
    events_map = coder.gen_cameo_event(map_articles(articleText))
    return str(events_map)


def code_articles_(articleText, petrGlobals={}):
    coder = EventCoder(petrGlobal=petrGlobals)
    events_map = coder.gen_cameo_event(articleText)
    return str(events_map)


if __name__ == "__main__":

    coder = EventCoder(petrGlobal={})

    #article = '{ "type" : "story", "doc_id" : "nytasiapacific20160622.0002", "head_line" : "Lightning Ridge Journal: An Amateur Undertaking in Australian Mining Town With No Funeral Home", "date_line" : "Tue, 21 Jun 2016 03:52:15 GMT", "sentences" : [ { "sentence_id" : 1, "sentence" : "A Tunisian court has jailed a Nigerian student for two years for helping young militants join an armed Islamic group in Lebanon, his lawyer said Wednesday.", "parse_sentence" : "(ROOT (S (S (NP (DT A) (NNP Tunisian) (NN court)) (VP (VBZ has) (VP (VBN jailed) (NP (DT a) (NNP Nigerian) (NN student)) (PP (IN for) (NP (NP (CD two) (NNS years)) (PP (IN for) (S (VP (VBG helping) (S (NP (JJ young) (NNS militants)) (VP (VB join) (NP (DT an) (JJ armed) (JJ Islamic) (NN group)) (PP (IN in) (NP (NNP Lebanon))))))))))))) (, ,) (NP (PRP$ his) (NN lawyer)) (VP (VBD said) (NP (NNP Wednesday))) (. .)))" } ], "corref" : "" }'

    article = """{
	"type":"story",
	"doc_id":"hindu_cities20160829.0009",
	"head_line":"Velankanni festival begins",
	"date_line" : "Tue, 21 Jun 2016 03:52:15 GMT",
	"sentences":[
		{
			"sentence_id":1,
			"sentence":"NAGAPATTINAM :TAMILNADU: 29/08/2016 : Our Lady of Health Velankanni Basilica Annual Flag hoisting, in progress, as thousands of devotees witnessing, in Nagapattinam district.Photo: B.Velankanni Raj  The 11-day annual feast of Shrine Basilica of Our Lady of Health, popularly called Annai Velankanni Matha, began in Velankanni on Monday evening with a huge procession and hoisting of the holy flag by Most Rev. M. Devadass Ambrose, Bishop of Thanjavur.",
			"parse_sentence":"(ROOT (NP (NP (NNP NAGAPATTINAM)) (: :) (NP (NP (NNP TAMILNADU)) (: :) (NP (NP (CD 29/08/2016)) (: :) (NP (NP (NP (PRP$ Our) (NN Lady)) (PP (IN of) (NP (NP (NNP Health) (NNP Velankanni) (NN Basilica) (JJ Annual) (NN Flag)) (VP (VBG hoisting) (, ,) (PP (IN in) (NP (NN progress))) (, ,) (SBAR (IN as) (S (NP (NP (NP (NP (NNS thousands)) (PP (IN of) (NP (NNS devotees)))) (VP (VBG witnessing) (, ,) (PP (IN in) (NP (NNP Nagapattinam) (NNP district.Photo))))) (: :) (NP (NP (NNP B.Velankanni) (NNP Raj) (NNP The) (JJ 11-day) (JJ annual) (NN feast)) (PP (IN of) (NP (NP (NNP Shrine) (NN Basilica)) (PP (IN of) (NP (NP (PRP$ Our) (NN Lady)) (PP (IN of) (NP (NNP Health))))))) (, ,) (VP (ADVP (RB popularly)) (VBN called) (S (NP (NNP Annai) (NNP Velankanni) (NNP Matha)))) (, ,))) (VP (VBD began) (PP (IN in) (NP (NNP Velankanni))) (PP (IN on) (NP (NP (NNP Monday) (NN evening)) (PP (IN with) (NP (NP (DT a) (JJ huge) (NN procession)) (CC and) (NP (NP (NN hoisting)) (PP (IN of) (NP (DT the) (JJ holy) (NN flag))) (PP (IN by) (NP (JJS Most) (NNP Rev.) (NNP M.) (NNP Devadass) (NNP Ambrose))))))))))))))) (, ,) (NP (NP (NNP Bishop)) (PP (IN of) (NP (NNP Thanjavur))))))) (. .)))",
			"token":"NAGAPATTINAM,:,TAMILNADU,:,29/08/2016,:,Our,Lady,of,Health,Velankanni,Basilica,Annual,Flag,hoisting,,,in,progress,,,as,thousands,of,devotees,witnessing,,,in,Nagapattinam,district.Photo,:,B.Velankanni,Raj,The,11-day,annual,feast,of,Shrine,Basilica,of,Our,Lady,of,Health,,,popularly,called,Annai,Velankanni,Matha,,,began,in,Velankanni,on,Monday,evening,with,a,huge,procession,and,hoisting,of,the,holy,flag,by,Most,Rev.,M.,Devadass,Ambrose,,,Bishop,of,Thanjavur,.",
			"lemma":"NAGAPATTINAM,:,TAMILNADU,:,29/08/2016,:,we,lady,of,Health,Velankanni,basilica,annual,flag,hoist,,,in,progress,,,as,thousand,of,devotee,witness,,,in,Nagapattinam,district.Photo,:,B.Velankanni,Raj,The,11-day,annual,feast,of,Shrine,basilica,of,we,lady,of,Health,,,popularly,call,Annai,Velankanni,Matha,,,begin,in,Velankanni,on,Monday,evening,with,a,huge,procession,and,hoisting,of,the,holy,flag,by,most,Rev.,M.,Devadass,Ambrose,,,Bishop,of,Thanjavur,.",
Example #8
from EventCoder import EventCoder

coder = EventCoder(petrGlobal={})

another_coder = EventCoder(petrGlobal=coder.get_PETRGlobals())

# Use a context manager so the file handle is closed when the loop finishes.
with open('core_nlp_out.txt') as input_file:
    for line in input_file:

        print line
        print '==================='

        print another_coder.encode(line)

# from dateutil import parser
# from datetime import datetime
#
# dateObject = parser.parse("")
#
# article_date = datetime.strftime(dateObject, '%Y%m%d')
#
#
# print article_date
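
# Working version of the sketch above, with a concrete date string (the
# date_line from Example #7) in place of the empty one:
#
# from dateutil import parser
# from datetime import datetime
#
# dateObject = parser.parse("Tue, 21 Jun 2016 03:52:15 GMT")
# article_date = datetime.strftime(dateObject, '%Y%m%d')
# print article_date  # 20160621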