Пример #1
0
def test_complex1():
    """Check the events coded for a sentence with nested clauses.

    A hand-written bracketed parse is normalised with
    ``utilities._format_parsed_str``, wrapped in a one-story, one-sentence
    input and handed to ``petrarch2.do_coding``.  The test passes when
    sentence ``'0'`` of story ``'test123'`` carries exactly the expected
    event tuple.
    """
    text = "A Tunisian court has jailed a Nigerian student for two years for helping young militants join an armed Islamic group in Lebanon, his lawyer said Wednesday."

    # NOTE(review): the lone ``f`` after ``(NN group))`` looks like a typo;
    # it is kept because it is part of the input handed to the formatter.
    parse = """( (S (S
    (NP (DT A) (NNP Tunisian) (NN court))
    (VP (AUXZ has)
    (VP (VBN jailed)
    (NP (DT a) (JJ Nigerian) (NN student))
    (PP (IN for)
    (NP (CD two) (NNS years)))
    (PP (IN for) (S
    (VP (VBG helping) (S
    (NP (JJ young) (NNS militants))
    (VP (VB join)
    (NP (NP (DT an) (JJ armed) (JJ Islamic) (NN group))f
    (PP (IN in)
    (NP (NNP Lebanon)))))))))))) (, ,)
    (NP (PRP$ his) (NN lawyer))
    (VP (VBD said)
    (NP (NNP Wednesday))) (. .)))"""

    sentence = {u'content': text,
                u'parsed': utilities._format_parsed_str(parse)}
    story = {u'sents': {u'0': sentence}, u'meta': {u'date': u'20010101'}}

    return_dict = petrarch2.do_coding({u'test123': story}, None)
    print(return_dict)

    expected_events = [('TUNJUD', 'NGAEDU', '173')]
    assert return_dict['test123']['sents']['0']['events'] == expected_events
Пример #2
0
    def get_phrases(self, text, parse):
        """Code one sentence and return its noun and verb phrases.

        The parse is normalised with ``utilities._format_parsed_str`` and
        submitted to ``petrarch2.do_coding`` as sentence ``'0'`` of a story
        keyed ``'test123'`` with the fixed date ``'20010101'``.

        Args:
            text: Sentence text stored as the sentence ``content``.
            parse: Bracketed parse string for the sentence.

        Returns:
            A dict with the keys ``nouns``, ``noun_coding``, ``verbs`` and
            ``verb_coding``.  ``verbs`` and ``verb_coding`` are empty
            strings when the coder reports no ``eventtext`` entry.

        Raises:
            KeyError: If the coder output lacks the ``verbs``/``nouns``
                metadata for the story.
        """
        parsed = utilities._format_parsed_str(parse)

        ddict = {u'test123':
                 {u'sents': {u'0': {u'content': text, u'parsed': parsed}},
                  u'meta': {u'date': u'20010101'}}}
        return_dict = petrarch2.do_coding(ddict, None)

        verb_info = return_dict['test123']['meta']['verbs']
        noun_pairs = verb_info['nouns']
        nouns = [pair[0] for pair in noun_pairs]
        noun_coding = [pair[1] for pair in noun_pairs]

        # ``list(...)`` keeps the first-entry lookup working on both Python 2
        # lists and Python 3 dict views; IndexError covers an ``eventtext``
        # mapping that is present but empty.
        try:
            verbs = list(verb_info['eventtext'].values())[0]
        except (KeyError, IndexError):
            print("No eventtext")
            verbs = ""
        try:
            # The third element of the first eventtext key is reported as the
            # verb coding.
            verb_coding = list(verb_info['eventtext'].keys())[0][2]
        except (KeyError, IndexError) as e:
            print(e)
            verb_coding = ""

        phrase_dict = {"nouns": nouns,
                       "noun_coding": noun_coding,
                       "verbs": verbs,
                       "verb_coding": verb_coding}
        return phrase_dict
Пример #3
0
def test_complex1():
    """Check the events coded for a sentence with nested clauses.

    A hand-written bracketed parse is normalised with
    ``utilities._format_parsed_str``, wrapped in a one-story, one-sentence
    input and handed to ``petrarch2.do_coding``.  The test passes when
    sentence ``'0'`` of story ``'test123'`` carries exactly the expected
    event tuple.
    """
    text = "A Tunisian court has jailed a Nigerian student for two years for helping young militants join an armed Islamic group in Lebanon, his lawyer said Wednesday."

    # NOTE(review): the lone ``f`` after ``(NN group))`` looks like a typo;
    # it is kept because it is part of the input handed to the formatter.
    parse = """( (S (S
    (NP (DT A) (NNP Tunisian) (NN court))
    (VP (AUXZ has)
    (VP (VBN jailed)
    (NP (DT a) (JJ Nigerian) (NN student))
    (PP (IN for)
    (NP (CD two) (NNS years)))
    (PP (IN for) (S
    (VP (VBG helping) (S
    (NP (JJ young) (NNS militants))
    (VP (VB join)
    (NP (NP (DT an) (JJ armed) (JJ Islamic) (NN group))f
    (PP (IN in)
    (NP (NNP Lebanon)))))))))))) (, ,)
    (NP (PRP$ his) (NN lawyer))
    (VP (VBD said)
    (NP (NNP Wednesday))) (. .)))"""

    sentence = {u'content': text,
                u'parsed': utilities._format_parsed_str(parse)}
    story = {u'sents': {u'0': sentence}, u'meta': {u'date': u'20010101'}}

    return_dict = petrarch2.do_coding({u'test123': story}, None)
    print(return_dict)

    expected_events = [('TUNJUD', 'NGAEDU', '173')]
    assert return_dict['test123']['sents']['0']['events'] == expected_events
Пример #4
0
def parse_sentence(stanford_parser, date, text):
    """Parse ``text`` and run the coder on its first sentence.

    Args:
        stanford_parser: Object whose ``parse_doc(text)`` result exposes
            ``['sentences'][0]['parse']``.
        date: Value stored as the story's ``meta.date`` and echoed back
            under ``sentence_date``.
        text: Raw text; only the first parsed sentence is coded.

    Returns:
        The dict returned by ``petrarch2.do_coding`` for story ``'doc'``,
        extended with ``has_events`` (whether sentence ``'0'`` has an
        ``events`` key) and ``sentence_date``.
    """
    first_sentence = stanford_parser.parse_doc(text)['sentences'][0]
    parsed = utilities._format_parsed_str(first_sentence['parse'])

    sentence_entry = {u'content': text, u'parsed': parsed}
    coding_input = {
        u'doc': {
            u'sents': {u'0': sentence_entry},
            u'meta': {u'date': date},
        }
    }

    return_dict = petrarch2.do_coding(coding_input)

    # Annotate the coder output in place before handing it back.
    return_dict['has_events'] = "events" in return_dict['doc']['sents']['0']
    return_dict['sentence_date'] = date

    print(str(return_dict))

    return return_dict
Пример #5
0
def test_simple2():
    """Check that "Germany arrested France" codes to one DEU/FRA/173 event.

    The parse is normalised with ``utilities._format_parsed_str`` and
    submitted to ``petrarch2.do_coding`` as sentence ``'0'`` of story
    ``'test123'``.
    """
    text = "Germany arrested France"
    parse = "(ROOT (S (NP (NNP Germany)) (VP (VBD arrested) (NP (NNP France)))))"

    sentence = {u'content': text,
                u'parsed': utilities._format_parsed_str(parse)}
    story = {u'sents': {u'0': sentence}, u'meta': {u'date': u'20010101'}}

    return_dict = petrarch2.do_coding({u'test123': story}, None)
    print(return_dict)

    expected_events = [('DEU', 'FRA', '173')]
    assert return_dict['test123']['sents']['0']['events'] == expected_events
Пример #6
0
def test_simple2():
    """Check that "Germany arrested France" codes to one DEU/FRA/173 event.

    The parse is normalised with ``utilities._format_parsed_str`` and
    submitted to ``petrarch2.do_coding`` as sentence ``'0'`` of story
    ``'test123'``.
    """
    text = "Germany arrested France"
    parse = "(ROOT (S (NP (NNP Germany)) (VP (VBD arrested) (NP (NNP France)))))"

    sentence = {u'content': text,
                u'parsed': utilities._format_parsed_str(parse)}
    story = {u'sents': {u'0': sentence}, u'meta': {u'date': u'20010101'}}

    return_dict = petrarch2.do_coding({u'test123': story}, None)
    print(return_dict)

    expected_events = [('DEU', 'FRA', '173')]
    assert return_dict['test123']['sents']['0']['events'] == expected_events
Пример #7
0
def get_phrases(doc):
    """Code every sentence of one article and collect its phrases.

    Args:
        doc: Indexable record where ``doc[0]`` is the article id,
            ``doc[1]`` is passed to ``date_formatter``, ``doc[2]`` is a JSON
            string with a ``sentences`` list (each item providing
            ``sen_id``, ``tree`` and ``sentence``) and ``doc[3]`` is the
            document id used as the story key.

    Returns:
        A tuple ``(article_id, phrases_json, doc_id)`` where
        ``phrases_json`` is a JSON-encoded list with one
        ``{sentence_text: phrase_dict}`` entry per sentence.  Each
        ``phrase_dict`` has the keys ``nouns``, ``noun_coding``, ``verbs``
        and ``verb_coding``.
    """
    article_id = doc[0]
    date = date_formatter(doc[1])
    doc_id = doc[3]
    sentences = json.loads(doc[2])['sentences']

    phrases_output = []
    for sentence in sentences:
        sentence_id = sentence['sen_id']
        sentence_tree = sentence['tree']
        sentence_text = sentence['sentence']
        parsed = utilities._format_parsed_str(sentence_tree)
        coding_input = {
            doc_id: {
                u'sents': {
                    sentence_id: {u'content': sentence_text,
                                  u'parsed': parsed},
                },
                u'meta': {u'date': date.encode()},
            }
        }

        # Start every sentence from a clean slate so a failure never reports
        # the nouns or coder output of the previous sentence.
        nouns = []
        noun_coding = []
        verb_info = {}
        try:
            return_dict = petrarch2.do_coding(coding_input)
            verb_info = return_dict[doc_id]['meta']['verbs']
            noun_pairs = verb_info['nouns']
            nouns = [pair[0] for pair in noun_pairs]
            noun_coding = [pair[1] for pair in noun_pairs]
            verbs = list(verb_info['eventtext'].values())[0]
        except Exception:
            # Best effort: any coding failure is reported like a sentence
            # without event text.
            print("No eventtext")
            verbs = ""
        try:
            # The third element of the first eventtext key is reported as the
            # verb coding; IndexError covers an empty eventtext mapping.
            verb_coding = list(verb_info['eventtext'].keys())[0][2]
        except (KeyError, IndexError) as e:
            print(e)
            verb_coding = ""

        phrase_dict = {"nouns": nouns,
                       "noun_coding": noun_coding,
                       "verbs": verbs,
                       "verb_coding": verb_coding}
        phrases_output.append({sentence_text: phrase_dict})
    return (article_id, json.dumps(phrases_output), doc_id)
def read_json_(article_main):
    """Build the coder input structure for one article record.

    Args:
        article_main: Mapping with an ``output`` entry holding the article
            as a JSON string (``doc_id`` and ``sentences`` keys are read)
            and an optional ``date`` entry.  When ``date`` is missing or
            empty the current local date is used.

    Returns:
        ``{doc_id: {'sents': {...}, 'meta': {'date': 'YYYYMMDD'}}}`` on
        success, or ``{}`` when building the structure fails.  Sentences
        are keyed by their 1-based position as a string and carry
        ``content``, ``parsed`` and ``geo-location``.

    Raises:
        Exceptions from reading ``article_main['date']`` or decoding
        ``article_main['output']`` propagate to the caller.
    """
    holding = {}
    sentence_limit = 7

    if ('date' not in article_main) or (len(article_main['date']) == 0):
        epoch_text = None
    else:
        # Drops the last three characters before the value is read as a
        # timestamp -- presumably epoch milliseconds -> seconds; TODO confirm.
        epoch_text = str(article_main['date'])[:-3]

    article = json.loads(article_main['output'])

    try:
        entry_id = str(article['doc_id'])
        sent_dict = {}
        if epoch_text is None:
            # No usable date supplied: fall back to today instead of trying
            # to convert a datetime object to an integer.
            article_date = datetime.now().strftime('%Y%m%d')
        else:
            article_date = datetime.fromtimestamp(
                int(epoch_text)).strftime('%Y%m%d')
        meta_content = {'date': article_date}

        for counter, sentence in enumerate(article['sentences'], start=1):
            # NOTE(review): stopping when the counter *reaches* the limit
            # stores at most sentence_limit - 1 sentences, although the
            # original comment said "first 7 sentences" -- confirm intent.
            if counter == sentence_limit:
                break
            sent_id = str(counter)
            parsed_text = utilities._format_parsed_str(
                sentence['parse_sentence'])
            # get_geo_location's return value is decoded as JSON text.
            geo_location = json.loads(get_geo_location(sentence['sentence']))
            sent_dict[sent_id] = {'content': sentence['sentence'],
                                  'parsed': parsed_text,
                                  'geo-location': geo_location}
        holding[entry_id] = {'sents': sent_dict, 'meta': meta_content}
        return holding
    except Exception as e:
        print(e)
        print(sys.exc_info()[0])

        return {}
def read_json(jsonString):
    """Build the coder input structure from an article JSON string.

    Args:
        jsonString: JSON text for one article.  The keys ``doc_id``,
            ``date_line``, ``head_line`` and ``sentences`` are read; each
            sentence provides ``sentence_id``, ``sentence`` and
            ``parse_sentence``.

    Returns:
        ``{doc_id: {'sents': {...}, 'meta': {'date': 'YYYYMMDD',
        'headline': ...}}}`` on success, or ``{}`` when the text cannot be
        decoded or the structure cannot be built.  An empty ``date_line``
        falls back to the current local date.
    """
    holding = {}
    sentence_limit = 7

    try:
        article = json.loads(jsonString)
    except Exception:
        print("Error while PARSING \n" + jsonString)
        # Nothing can be read from an undecodable article; report it the same
        # way as every other failure instead of indexing into ``None``.
        return {}

    if len(article['date_line']) == 0:
        dateObject = datetime.now()
    else:
        dateObject = parser.parse(article['date_line'])

    try:
        entry_id = str(article['doc_id'])
        sent_dict = {}
        article_date = dateObject.strftime('%Y%m%d')
        meta_content = {'date': article_date,
                        'headline': article['head_line']}

        for counter, sentence in enumerate(article['sentences'], start=1):
            sent_id = str(sentence['sentence_id'])
            # NOTE(review): stopping when the counter *reaches* the limit
            # stores at most sentence_limit - 1 sentences, although the
            # original comment said "first 7 sentences" -- confirm intent.
            if counter == sentence_limit:
                break
            parsed_text = utilities._format_parsed_str(
                sentence['parse_sentence'])
            # get_geo_location's return value is decoded as JSON text.
            geo_location = json.loads(get_geo_location(sentence['sentence']))
            sent_dict[sent_id] = {'content': sentence['sentence'],
                                  'parsed': parsed_text,
                                  'geo-location': geo_location}
        holding[entry_id] = {'sents': sent_dict, 'meta': meta_content}
        return holding
    except Exception:
        print('Invalid JSON Format')
        print(sys.exc_info()[0])

        return {}
Пример #10
0
def test_simple():
    """Code "Germany invaded France" and print the coder output.

    The parse is normalised with ``utilities._format_parsed_str`` and
    submitted to ``petrarch2.do_coding`` as sentence ``'0'`` of story
    ``'test123'``.  Both the full result and the sentence's ``events``
    entry are printed; nothing is asserted.
    """
    text = "Germany invaded France"
    parse = "(ROOT (S (NP (NNP Germany)) (VP (VBD invaded) (NP (NNP France)))))"

    sentence = {u'content': text,
                u'parsed': utilities._format_parsed_str(parse)}
    story = {u'sents': {u'0': sentence}, u'meta': {u'date': u'20010101'}}

    return_dict = petrarch2.do_coding({u'test123': story})
    print(return_dict)

    events = return_dict['test123']['sents']['0']['events']
    print(events)
Пример #11
0
# Code every sentence of every fetched row.  Row layout as read below:
# row[1] -> date (via date_formatter), row[2] -> JSON text with a
# 'sentences' list, row[3] -> document id used as the story key.
rows = c.fetchall()

for row in rows:
    phrases_output = []
    date = date_formatter(row[1])
    doc_id = row[3]
    corenlpJsonData = json.loads(row[2])
    sentences = corenlpJsonData['sentences']
    for sentence in sentences:
        sen_phrases_dict = {}
        sentenceJson = json.loads(json.dumps(sentence))
        sentenceId = sentenceJson['sen_id']
        sentenceTree = sentenceJson['tree']
        sentenceData = sentenceJson['sentence']
        parsed = utilities._format_parsed_str(sentenceTree)
        # NOTE(review): ``dict`` shadows the builtin; the name is kept because
        # code outside this excerpt may still refer to it.
        dict = {doc_id: {u'sents': {sentenceId: {u'content': sentenceData, u'parsed': parsed}}, u'meta': {u'date': date.encode()}}}

        # Reset per-sentence state so a failed coding run neither reuses the
        # previous sentence's results nor leaves these names unbound.
        return_dict = {}
        nouns = []
        noun_coding = []
        try:
            return_dict = petrarch2.do_coding(dict, None)
            n = return_dict[doc_id]['meta']['verbs']['nouns']
            nouns = [i[0] for i in n]
            noun_coding = [i[1] for i in n]
            verbs = list(return_dict[doc_id]['meta']['verbs']['eventtext'].values())[0]
        except Exception:
            # Best effort: any coding failure is reported like a sentence
            # without event text.
            print("No eventtext")
            verbs = ""
        try:
            # IndexError covers an eventtext mapping that is present but empty.
            verb_coding = list(return_dict[doc_id]['meta']['verbs']['eventtext'].keys())[0][2]
        except (KeyError, IndexError) as e:
            print(e)
            verb_coding = ""
Пример #12
0
    sen_out_dict= {}
    phrases_out_dict={}
    sen_out_records= []
    sent_phrases_array={}
    sentences=data_json['sentences']
    sen_failed=0;
    sen_parsed=0;
    for sentence in sentences:
		sen_dump=json.dumps(sentence)
		sen_json=json.loads(sen_dump)
		sen_id=sen_json['sen_id'].encode()
		sen_data=sen_json['sentence']
		sen_parse=sen_json['tree']
		text=sen_data
		parse=sen_parse
		parsed = utilities._format_parsed_str(parse)

                try:
			        py_logger.debug('parsing : '+mongo_id)
			        dict = {mongo_id: {u'sents': {sen_id: {u'content': text, u'parsed': parsed}},u'meta': {u'date': date.encode()}}}
			        return_dict = petrarch2.do_coding(dict,None)
			        return_dict= json.dumps(return_dict)
			        #print return_dict
			        output=return_dict
			        sen_out_records.append(output)
			        sen_parsed= sen_parsed+1;
				sent_phrases_array[text]= get_phrases('', text, parsed)

                except:
			        sen_failed=sen_failed+1;
			        py_logger.error('Parsing failed: '+mongo_id+' sen_id: '+sen_id)