# Example #1
# 0
import codecs
import json
import sys

import func

# Command-line entry point: validate the argument count, open the prediction
# and output files, and load the parses / relations inputs into the
# module-level names that the rest of the script reads.
if len(sys.argv) < 5:
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("""
	./addPredictionPSArg1.py <relations.json> <parses.json> <predictionFile>  <writeF>	
	""")
    # A usage error is a failure, so report it through a non-zero status.
    sys.exit(1)

relations = sys.argv[1]  # path to relations.json; rebound to the parsed list below
# Kept open at module level: nothing in this block consumes it.
predictedClauses = open(sys.argv[3], 'r')
parsesFile = sys.argv[2]
# Kept open at module level so later code can write results to it.
outF = open(sys.argv[4], 'w')

# parses.json is a single JSON document; close the handle once it is parsed.
with codecs.open(parsesFile, encoding='utf8') as parsesF:
    parseDict = json.load(parsesF)
dictByDocID = func.makeDictByDocID(parseDict)

# relations.json holds one JSON object per line.
with codecs.open(relations, encoding='utf8') as relationsF:
    relations = [json.loads(x) for x in relationsF]


def writeOutputFormat(relations, outF):
    """Write each relation to *outF* as one JSON document per line.

    relations -- iterable of JSON-serialisable objects.
    outF      -- object exposing ``write``; it is neither flushed nor closed
                 here.
    """
    for entry in relations:
        serialized = json.dumps(entry)
        outF.write(serialized + '\n')


# NOTE(review): only the counter setup and the loop header of this function
# are visible in this excerpt; what it computes or returns cannot be
# determined from here.
def makeRelationsDictForOutput(relationDict, psArg1PredictionsDict,
                               dictByDocID):
    #relations = [json.loads(x) for x in relationsF]
    # Counters initialised before the loop; how they are updated is not shown.
    total = 0
    correct = 0.0  # a float, unlike `total` -- presumably feeds a ratio; TODO confirm
    # NOTE(review): the body of this loop is missing from the excerpt.
    for relation in relationDict:
def produceNonExplicitRelationCandidates(predictionsF, parsesF, docDir):
    """Build non-explicit relation candidates and merge in the predictions.

    For every document in the parses file, each pair of adjacent sentences
    becomes a candidate relation unless

    * the paragraph check is active for the document and
      ``inSameParagraph`` rejects the pair, or
    * ``isExplicitRelation`` reports that the pair is already covered by
      one of the predicted relations.

    The paragraph check is only active when the number of parsed sentences
    equals ``len(createSentenceLineDict(DocID, docDir))``; documents where
    the counts differ are counted and reported, and all their adjacent
    pairs are considered.

    predictionsF -- path to a UTF-8 file with one JSON relation per line.
    parsesF      -- path to a UTF-8 JSON file keyed by document ID, each
                    value holding a ``'sentences'`` list.
    docDir       -- passed through to ``createSentenceLineDict``
                    (presumably the raw-document directory; verify there).

    Returns a list with the new candidates (dicts with ``'DocID'``,
    ``'Arg1'`` and ``'Arg2'``) followed by every predicted relation.
    """
    print("In function: produceNonExplicitRelationCandidates")

    # Read predicted relations.json (one JSON object per line).
    print("Reading predicted relations.json")
    with codecs.open(predictionsF, encoding='utf8') as pdtb_file:
        predictions = [json.loads(x) for x in pdtb_file]
    print("Done")

    # Read parses.json (a single JSON document).
    print("Reading parses.json")
    with codecs.open(parsesF, encoding='utf8') as parse_file:
        parses = json.load(parse_file)
    print("Done")

    dictByDocID = func.makeDictByDocID(parses)

    relations = []
    nonMatchNum = 0
    for DocID in parses:
        senList = parses[DocID]['sentences']
        senLineDict = createSentenceLineDict(DocID, docDir)

        # Paragraph information is only trusted when both sources agree on
        # the number of sentences.
        toCheckParagraph = len(senList) == len(senLineDict)
        if not toCheckParagraph:
            nonMatchNum += 1

        for sen1ID in range(len(senList) - 1):
            sen2ID = sen1ID + 1

            # Skip pairs that straddle a paragraph boundary.
            if toCheckParagraph and not inSameParagraph(sen1ID, sen2ID,
                                                        senLineDict):
                continue

            # Skip pairs that already carry an explicit relation.
            if isExplicitRelation(sen1ID, sen2ID, DocID, predictions):
                continue

            relations.append({
                'DocID': DocID,
                'Arg1': extractArgFields(DocID, senList[sen1ID], sen1ID,
                                         dictByDocID),
                'Arg2': extractArgFields(DocID, senList[sen2ID], sen2ID,
                                         dictByDocID),
            })

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print('\n' + str(len(relations)) +
          ' Non-Explicit relations created in total')
    print(str(nonMatchNum) + '/' + str(len(parses)) +
          ' documents have inconsistent sentence counts in parses.json and raw files')

    # Add the explicit (predicted) relations after the new candidates.
    relations.extend(predictions)
    print('\nExporting ' + str(len(relations)) +
          ' relations (both explicit and non-explicit) in total')
    return relations
def _loadJsonLines(path):
    """Return the JSON values parsed from each line of the UTF-8 file *path*."""
    with codecs.open(path, encoding='utf8') as handle:
        return [json.loads(line) for line in handle]


def _loadJsonDocument(path):
    """Return the single JSON document stored in the UTF-8 file *path*."""
    with codecs.open(path, encoding='utf8') as handle:
        return json.load(handle)


def produceNonExplicitRelationCandidates(predictionsF, parsesF, docDir):
    """Create candidate relations for adjacent sentences, plus predictions.

    Each document in the parses file is walked sentence by sentence. A pair
    of neighbouring sentences yields a candidate ``{'DocID', 'Arg1',
    'Arg2'}`` dict unless ``inSameParagraph`` rejects it (checked only when
    the parsed sentence count equals the length of the mapping returned by
    ``createSentenceLineDict``) or ``isExplicitRelation`` says the pair is
    already present among the predicted relations.

    predictionsF -- path to the predicted relations file (JSON lines, UTF-8).
    parsesF      -- path to the parses file (one JSON object, UTF-8).
    docDir       -- forwarded unchanged to ``createSentenceLineDict``.

    Returns the candidates followed by all predicted relations, in one list.
    Progress and summary lines are printed to standard output.
    """
    print("In function: produceNonExplicitRelationCandidates")

    print("Reading predicted relations.json")
    predictions = _loadJsonLines(predictionsF)
    print("Done")

    print("Reading parses.json")
    parses = _loadJsonDocument(parsesF)
    print("Done")

    dictByDocID = func.makeDictByDocID(parses)

    relations = []
    nonMatchNum = 0
    for DocID in parses:
        senList = parses[DocID]['sentences']
        senLineDict = createSentenceLineDict(DocID, docDir)

        if len(senList) == len(senLineDict):
            toCheckParagraph = True
        else:
            # Counts disagree, so the paragraph check is skipped for this
            # document.
            toCheckParagraph = False
            nonMatchNum += 1

        # Walk every adjacent sentence pair (i, i + 1).
        for sen1ID, (sen1, sen2) in enumerate(zip(senList, senList[1:])):
            sen2ID = sen1ID + 1

            if toCheckParagraph:
                if not inSameParagraph(sen1ID, sen2ID, senLineDict):
                    continue

            if isExplicitRelation(sen1ID, sen2ID, DocID, predictions):
                continue

            relation = {'DocID': DocID}
            relation['Arg1'] = extractArgFields(DocID, sen1, sen1ID,
                                                dictByDocID)
            relation['Arg2'] = extractArgFields(DocID, sen2, sen2ID,
                                                dictByDocID)
            relations.append(relation)

    # Parenthesised single-argument prints run on both Python 2 and 3.
    print('\n' + str(len(relations)) +
          ' Non-Explicit relations created in total')
    print(str(nonMatchNum) + '/' + str(len(parses)) +
          ' documents have inconsistent sentence counts in parses.json and raw files')

    # The explicit (predicted) relations go after the generated candidates.
    relations.extend(predictions)
    print('\nExporting ' + str(len(relations)) +
          ' relations (both explicit and non-explicit) in total')
    return relations
# Example #4
# 0
   features.append('prevLastPOS:'+prevLastPOS)
   features.append('nextFirstPOS:'+nextFirstPOS)
   features.append('prevLastAndComma:'+prevLast+'_'+commaBefore)
   features.append('nextFirstAndComma:'+nextFirst+'_'+commaAfter)
   features.append('commaAndcurFirstWord:'+commaBefore+'_'+curFirstWord)
   features.append('curLastWordAndComma:'+curLastWord+'_'+commaAfter)
   features.append('commaAndcurFirstPOS:'+commaBefore+'_'+curFirstPOS)
   features.append('curLastPOSAndComma:'+curLastPOS+'_'+commaAfter)
   features.append('verb1:'+v1)
   features.append('verb2:'+v2)
   features.append('verb3:'+v3)
   return features

   
	
	
	
# Driver: derive the lookup structures through the `func` helpers, then
# generate the data and write it to outF.
dictByDocID = func.makeDictByDocID(parseDict)
# Disabled debug aid: func.getAllVerbsFromData(dictByDocID) followed by exit()
dictByTokenID = func.makeDictByTokenID(dictByDocID)
# Disabled debug aids: dump dictByDocID keys and len(relations) to stderr.
relationDict = func.makeRelationDict(relations)
makeDataForArg1PSExplicit(
    dictByDocID,
    dictByTokenID,
    parseDict,
    relationDict,
    verbList,
    relations,
    outF)
# Disabled alternative, which would use gold arguments for implicit relations:
# makeDataForImplicitSenseGold(dictByDocID, relationDict, outF)