def main():
    """Print corpus-wide statistics about events connected by after-links.

    For every ``.ann`` file found directly under ``mypath`` the after-links
    and the extra after-links of the script are merged and clustered with
    ``createEventsClusters``.  Each cluster is flattened into a sequence of
    ``"<type>-<subtype>"`` labels (two labels per event pair), and a
    ``Counter`` of the ``'#'``-joined sequences is printed.

    Afterwards the clusters of the last processed file, the story event-pair
    distances and their mean are printed, and the remaining statistics
    routines of this module are invoked.
    """
    scriptsDatabase = []
    # Bound up-front: both names are read after the loop, and would otherwise
    # be undefined when the directory holds no '.ann' file.
    script = None
    orderedEventsLists = []

    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    for file in onlyfiles:
        fileNoExtension, extension = os.path.splitext(file)
        logging.debug("This is the file I am processing: %s", fileNoExtension)
        if extension != '.ann':
            continue

        script = event.Script(fileNoExtension)
        eventsTagsPairList = getListOfAfterLinks(script) + getListOfExtraAfterLinks(script)
        logging.debug("This is the list of unordered events: %s", eventsTagsPairList)
        orderedEventsLists = createEventsClusters(eventsTagsPairList)
        logging.debug("This is the list of ordered events: %s", orderedEventsLists)

        for oel in orderedEventsLists:
            # oel[0] holds the (eventId, eventId) pairs of one cluster.
            eventsSequenceList = []
            for ev in oel[0]:
                for eventId in (ev[0], ev[1]):
                    # Only the first two of the six returned details are used.
                    eventType, eventSubtype, _, _, _, _ = script.getEventDetailsByEventId(eventId)
                    eventsSequenceList.append(eventType + '-' + eventSubtype)
            scriptsDatabase.append([script.scriptName, eventsSequenceList])

    scripts = ['#'.join(x[1]) for x in scriptsDatabase]
    counts = Counter(scripts)
    print(counts)

    # NOTE(review): the result of this call was never used in the original
    # either; it is kept (guarded) in case the helper has side effects.
    if script is not None:
        eventsTagsPairList = getListOfAfterLinks(script)
    # Clusters of the last '.ann' file processed (empty list if there was none).
    print(orderedEventsLists)

    distances = getAverageStoryEventPairsDistance()
    print(distances)
    print(numpy.mean(distances))
    oel = computeOrderedSequenceList()
    probabilities = computePairsProbabilities()
    coreferences = getCoreferenceLinks()
    print(1)
def _encodeEventTypeSubtype(script, eventId):
    """Return the ``"<typeNumber>-<subtypeIndex>"`` code of an event, or None.

    The type number is looked up in ``c.EVENTTYPES`` and the subtype index is
    the position of the subtype inside ``c.EVENTSUBTYPES[typeNumber]``.
    None is returned (after logging) when the event details cannot be
    retrieved from ``script``.
    """
    try:
        eventType, eventSubtype, _, _, _, _ = script.getEventDetailsByEventId(eventId)
    except Exception:
        # The original used a bare ``except`` here; keep the best-effort
        # behaviour but let KeyboardInterrupt/SystemExit propagate.
        logging.info('Scramble issue with %s', script.scriptName)
        return None
    logging.debug("Events: %s, %s", eventType, eventSubtype)
    eventTypeNumber = c.EVENTTYPES[eventType]
    eventSubTypeNumber = c.EVENTSUBTYPES[eventTypeNumber].index(eventSubtype)
    return str(eventTypeNumber) + '-' + str(eventSubTypeNumber)


def computePairsProbabilities():
    """Count the type/subtype transitions of all extra after-link event pairs.

    Every file directly under ``mypath`` that has an extension is loaded as
    an ``event.Script``; its extra after-links are clustered with
    ``createEventsClusters`` and each event pair of each cluster is encoded
    as ``"<code of ev[0]>e<code of ev[1]>"``.  For instance ``7-3e7-0`` means
    type 7 / subtype index 3 followed by type 7 / subtype index 0 (numbers as
    defined in the constants module).  The original author notes that only
    133 distinct pairs occur in the whole corpus.

    A pair is skipped when the details of either event cannot be retrieved.
    Previously the failure was only logged and the stale values of the
    preceding event were reused, which produced bogus pairs (or a NameError
    on the very first event).

    :return: tuple ``(counts, probabilities)`` where ``counts`` is a
        ``Counter`` keyed by pair string and ``probabilities`` maps each pair
        string to its relative frequency among all collected pairs.
    """
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    allGoodEventPairs = []
    for file in onlyfiles:
        fileNoExtension, extension = os.path.splitext(file)
        logging.debug("This is the file I am processing: %s", fileNoExtension)
        # NOTE(review): any extension is accepted here, not just '.ann' --
        # confirm the directory cannot contain several files per script.
        if not extension:
            continue

        script = event.Script(fileNoExtension)
        eventsTagsPairList = getListOfExtraAfterLinks(script)
        logging.debug("This is the list of unordered events: %s", eventsTagsPairList)
        orderedEventsLists = createEventsClusters(eventsTagsPairList)
        logging.debug("This is the list of ordered events: %s", orderedEventsLists)

        for oel in orderedEventsLists:
            # oel[0] holds the (eventId, eventId) pairs of one cluster.
            for ev in oel[0]:
                eventPairString1 = _encodeEventTypeSubtype(script, ev[0])
                if eventPairString1 is None:
                    continue
                eventPairString2 = _encodeEventTypeSubtype(script, ev[1])
                if eventPairString2 is None:
                    continue
                allGoodEventPairs.append(eventPairString1 + 'e' + eventPairString2)

    counts = Counter(allGoodEventPairs)
    # float() keeps true division on Python 2; with no pairs the loop below
    # is empty, so no division by zero can occur.
    numberOfEvents = float(len(allGoodEventPairs))
    probabilities = {}
    for key, count in counts.items():
        probabilities[key] = count / numberOfEvents
    return counts, probabilities
def computeOrderedSequenceList():
    """Cluster the after-links of the files under ``mypath``.

    Every file directly under ``mypath`` that has an extension is loaded as
    an ``event.Script`` and its after-links are passed to
    ``createEventsClusters``.

    :return: the clusters computed for the last file processed, or an empty
        list when no file qualified (previously this case raised an
        UnboundLocalError).
    """
    # NOTE(review): results of earlier files are overwritten, not accumulated;
    # confirm that returning only the last file's clusters is intended.
    orderedEventsLists = []
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    for file in onlyfiles:
        fileNoExtension, extension = os.path.splitext(file)
        logging.debug("This is the file I am processing: %s", fileNoExtension)
        if not extension:
            continue
        script = event.Script(fileNoExtension)
        eventsTagsPairList = getListOfAfterLinks(script)
        logging.debug("This is the list of unordered events: %s", eventsTagsPairList)
        orderedEventsLists = createEventsClusters(eventsTagsPairList)
        logging.debug("This is the list of ordered events: %s", orderedEventsLists)
    return orderedEventsLists
def getCoreferenceLinks():
    """Log the two event ids of every after-link found under ``mypath``.

    Each file directly under ``mypath`` that has an extension is loaded as an
    ``event.Script``.  Its after-links are fetched and, for every link, the
    first and second event id are written to the log at INFO level together
    with a running pair number that keeps counting across files.  The
    coreference clusters of the script are requested too, but not used.

    :return: always 1
    """
    pairNumber = 0
    candidates = [name for name in listdir(mypath) if isfile(join(mypath, name))]
    for candidate in candidates:
        stem, extension = os.path.splitext(candidate)
        logging.debug("This is the file I am processing: %s" % stem)
        if not extension:
            continue

        script = event.Script(stem)
        # Example of the after-links structure seen by the original author:
        # [[[['E225', 'E459']], 0], [[['E446', 'E407']], 0]]
        afterLinks = getListOfAfterLinks(script)
        # The result is unused, but the call itself is preserved.
        script.getListOfCoreferenceClusters()
        logging.debug("This is the list of unordered events: %s", afterLinks)

        for link in afterLinks:
            pairNumber += 1
            logging.info("Event1: %d %s", pairNumber, link[0][0][0])
            logging.info("Event2: %d %s", pairNumber, link[0][0][1])
    return 1
def checkLinksConsistency(self):
    """Report whether any event cluster spans more than one tree position.

    The script belonging to this article is reloaded, the tree position of
    every event that has an after-link is computed with
    ``getTreePositionFromEvent``, and for each cluster in
    ``script.eventsClusters`` the positions of all events in its pairs are
    collected.

    :return: 1 if at least one cluster contains events at different
        positions, 0 otherwise.
    """
    # Derive the script name from the full article path.
    scriptName = self.articleName.replace(self.path, "").replace(".txt", "")
    script = ev.Script(scriptName)

    linkedEvents = script.getListOfEventsWithAfterLinks()
    logging.debug("This is the list of events: %s", linkedEvents)

    # Map every event tag to its position in the article tree.
    positionByTag = {}
    for linkedEvent in linkedEvents:
        logging.debug("This is the start and stop: %s", linkedEvent.textStartStop)
        position = self.getTreePositionFromEvent(linkedEvent)
        logging.debug("The sentence is at this position: %s\n\n\n", position)
        positionByTag[linkedEvent.eventTag] = position

    clusters = script.eventsClusters
    logging.debug("This is the positions of the events: %s", positionByTag)
    logging.debug("These are the events clusters: %s", clusters)

    # One flat list of positions per cluster, two entries per event pair.
    clusterPositionsList = [
        [positionByTag[tag] for pair in cluster[0] for tag in (pair[0], pair[1])]
        for cluster in clusters
    ]
    logging.debug("These are the position lists: %s", clusterPositionsList)

    inconsistent = 0
    for positions in clusterPositionsList:
        if len(set(positions)) <= 1:
            continue
        inconsistent = 1
        logging.debug("I have found an exception, ")
    return inconsistent
def __init__(self, articleName, path):
    """Load an article, its annotation script and its plain text.

    :param articleName: file name of the article; a ``.txt`` part is removed
        to obtain the script name.
    :param path: prefix that is concatenated in front of ``articleName`` to
        obtain the file to open (no separator is inserted).
    """
    fullName = path + articleName
    self.path = path
    self.articleNameNoPath = articleName
    self.articleName = fullName
    logging.debug(self.articleName)

    self.script = ev.Script(articleName.replace(".txt", ""))
    # All the event objects contained in the script.
    self.articleEventsList = self.script.eventsList

    self.articleTree = etree.parse(self.articleName)
    self.articleType = self.getArticleType()
    if self.articleType != "story":
        self.docRoot = self.articleTree.getroot()
    else:
        # Stories additionally expose their headline and sentences.
        self.docRoot = self.articleTree.xpath("//DOC[@type='story']")
        self.headline = self.getStoryHeadline()
        self.sentences = self.getStorySentences()

    logging.debug("This is the file name: %s", articleName)
    with open(fullName) as f:
        rawText = f.read()
    # Normalise the line breaks to '\n', then collapse every run of
    # non-ASCII characters into a single space.
    normalisedText = '\n'.join(rawText.splitlines())
    self.fileAsCharactersList = re.sub(r'[^\x00-\x7F]+', ' ', normalisedText)
def main():
    """Post-process the parser's ``.tbf`` output, filtering ``@After`` lines.

    The input file ``parseroutput-<ANNDIR minus its last character>.tbf`` is
    copied line by line to ``postprocessoutput<...>.tbf``.  All lines are
    copied verbatim except ``@After`` relations, which are written only when

    * the type/subtype signature of the event pair (as built by
      ``createEventsPairScript``) is a key of the counts returned by
      ``stats.computePairsProbabilities()``,
    * the same pair (in either direction) was not already written for the
      current document, and
    * the pair is not redundant with respect to the coreference clusters
      seen so far in the current document (both events in the same cluster,
      or an equivalent cluster-level link already recorded).

    Per-document state is reset at every ``#BeginOfDocument`` line.
    """
    parserName = 'parseroutput-'+const.ANNDIR[0:-1]+'.tbf'
    with open(parserName, 'r') as po:
        lines = [l.rstrip() for l in po.readlines()]

    probs = stats.computePairsProbabilities()
    countersProbs = probs[0]

    # Compiled once instead of being rebuilt for every input line.
    beginPattern = re.compile('^#BeginOfDocument (.*)$')
    coreferencePattern = re.compile('^@Coreference\tR[0-9]+\t(.*)$')
    afterPattern = re.compile('^@After\tR[0-9]+\t(E[0-9]+),(E[0-9]+)$')

    # documents maps a document key to the @After pairs written for it; it is
    # local bookkeeping only and is not returned.
    documents = {}
    key = None           # key of the document currently being processed
    script = None        # ev.Script of the current document
    writtenPairs = []    # (id1, id2) pairs already written for this document
    cList = []           # coreference clusters (lists of event ids)
    caList = []          # cluster-level after-links already seen, both directions

    parserNameOut = 'postprocessoutput'+const.ANNDIR[0:-1]+'.tbf'
    with open(parserNameOut, 'w') as ppo:
        for line in lines:
            m = beginPattern.match(line)
            corefMatch = coreferencePattern.match(line)
            a = afterPattern.match(line)

            if m:
                ppo.write("%s\n" % line)
                if key is not None:
                    # Store the finished document before starting a new one.
                    documents[key] = writtenPairs
                key = m.group(1)
                script = ev.Script(key)
                writtenPairs = []
                cList = []
                caList = []
            elif corefMatch:
                cList.append(corefMatch.group(1).split(','))
                ppo.write("%s\n" % line)
            elif a:
                id1 = a.group(1)
                id2 = a.group(2)
                skipPair = False
                event1 = script.getEventByEventId(id1)
                event2 = script.getEventByEventId(id2)
                eventsPairScript = createEventsPairScript(event1, event2)
                # getClusterGroup yields -1 for an event outside every cluster.
                event1cg = getClusterGroup(cList, id1)
                event2cg = getClusterGroup(cList, id2)
                logging.debug("These are the cluster groups: %s %s", event1cg, event2cg)

                if event1cg != -1 and event2cg != -1:
                    # Both events belong to a coreference cluster.
                    logging.debug("Found a pair: %s %s, for the document %s", event1cg, event2cg, key)
                    logging.debug("This is the current list: %s", caList)
                    if event1cg == event2cg or (event1cg, event2cg) in caList:
                        skipPair = True
                    else:
                        caList.append((event1cg, event2cg))
                        caList.append((event2cg, event1cg))
                elif (event1cg == -1) != (event2cg == -1):
                    # Exactly one event is clustered: the unclustered one is
                    # represented by its own event id.
                    logging.debug("Found a pair: %s %s, for the document %s", event1cg, event2cg, key)
                    logging.debug("This is the current list: %s", caList)
                    if event1cg == -1:
                        event1cg = id1
                    else:
                        event2cg = id2
                    if (event1cg, event2cg) in caList:
                        skipPair = True
                    else:
                        caList.append((event1cg, event2cg))
                        caList.append((event2cg, event1cg))

                if eventsPairScript in countersProbs:
                    alreadyWritten = ((id1, id2) in writtenPairs
                                      or (id2, id1) in writtenPairs)
                    if not alreadyWritten and not skipPair:
                        ppo.write("%s\n" % line)
                        writtenPairs.append((id1, id2))
            else:
                ppo.write("%s\n" % line)

        if key is not None:
            # The last document has no following header to trigger the store.
            documents[key] = writtenPairs