Exemplo n.º 1
0
 def _markNamedEntities(self):
     """
     This method is used to define which tokens belong to _named_ entities.
     Named entities are sometimes masked when testing learning of interactions, to
     prevent the system making a trivial decision based on commonly interacting names.

     Side effects: populates self.tokenIsName, self.tokenIsEntity and
     self.tokenIsEntityHead, and may set the "isName" attribute on entity
     elements that do not yet have one.
     """
     # token -> True if the token overlaps an entity flagged as a name
     self.tokenIsName = {}
     # token -> True if the token overlaps any entity at all
     self.tokenIsEntity = {}
     # token -> list of entities whose head offset overlaps the token
     self.tokenIsEntityHead = {}
     # Initialize the dictionaries
     for token in self.tokens:
         self.tokenIsName[token] = False
         self.tokenIsEntity[token] = False
         self.tokenIsEntityHead[token] = []
     for entity in self.entities:
         entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
         entityHeadOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
         for token in self.tokens:
             tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
             for entityOffset in entityOffsets:
                 if Range.overlap(entityOffset, tokenOffset):
                     self.tokenIsEntity[token] = True
                     if entity.get("isName") != None:
                         if entity.get("isName") == "True":
                             self.tokenIsName[token] = True
                     else:
                         # Entities with no explicit isName attribute are
                         # treated (and persisted) as names by default.
                         entity.set("isName", "True")
                         self.tokenIsName[token] = True
             if Range.overlap(entityHeadOffset, tokenOffset):
                 self.tokenIsEntityHead[token].append(entity)
Exemplo n.º 2
0
def writeProteins(document, inputCorpus, outputFile=None):
    """
    Write the named (protein) entities of a document in shared-task trigger
    format ("Tnn<TAB>Protein start end<TAB>text") to outputFile.

    Returns triggerMap: {entity id -> original trigger id stem, e.g. "T12"}.
    When outputFile is None nothing is written, but the map is still built.
    """
    entityMap = {}   # origId number -> entity element
    offsetMap = {}   # origId number -> offset in GENIA coordinates
    triggerMap = {}  # entity id -> original trigger id stem (e.g. "T12")
    for sentenceElement in document.findall("sentence"):
        sentence = inputCorpus.sentencesById[sentenceElement.get("id")]
        sentenceOffset = Range.charOffsetToSingleTuple(
            sentenceElement.get("charOffset"))
        for entity in sentence.entities:
            if entity.get("isName") == "True":
                # origId looks like "<doc>.T12"; keep the trailing "T12" part
                origId = entity.get("origId").split(".")[-1]
                origIdNumber = int(origId[1:])
                assert (origIdNumber not in entityMap.keys())
                entityMap[origIdNumber] = entity

                entityOffset = Range.charOffsetToSingleTuple(
                    entity.get("charOffset"))
                offsetMap[origIdNumber] = getGeniaOffset(
                    sentenceOffset, entityOffset)
                triggerMap[entity.get("id")] = origId
    # Emit proteins in ascending original-id order so output is deterministic
    for key in sorted(entityMap.keys()):
        entity = entityMap[key]
        if outputFile != None:
            outputFile.write(
                encode(triggerMap[entity.get("id")] + "\tProtein " +
                       str(offsetMap[key][0]) + " " + str(offsetMap[key][1]) +
                       "\t" + entity.get("text") + "\n"))
    return triggerMap
Exemplo n.º 3
0
def makeDETSubPhrases(phrases, tokens, phraseDict, filter=None):
    """
    Build new phrase variants by dropping or adding a leading determiner (DT).

    For each phrase (optionally restricted to the types listed in *filter*)
    two candidates are generated:
      * "DT(-)-<type>": the phrase without its leading DT token
      * "DT(+)-<type>": the phrase extended to include a preceding DT token
    Each new phrase whose offset is not already present is appended to the
    returned list and registered in *phraseDict* (offset -> [phrase]).
    """
    newPhrases = []
    for phrase in phrases:
        if filter is not None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        # Drop DT: the phrase starts with a determiner -> variant without it.
        # NOTE(review): the phraseBegin > 0 guard also skips phrases starting
        # at token 0 — confirm this is intended.
        if phraseBegin > 0 and tokens[phraseBegin].get("POS") == "DT":
            newPhraseOffset = (Range.charOffsetToSingleTuple(tokens[phraseBegin+1].get("charOffset"))[0], phraseOffset[1])
            newPhrase = makePhrase("DT(-)-" + phrase.get("type"),
                      newPhraseOffset, 
                      phraseBegin + 1, 
                      phraseEnd)
            if newPhraseOffset not in phraseDict:  # 'in' replaces deprecated has_key
                newPhrases.append(newPhrase)
                phraseDict[newPhraseOffset] = [newPhrase]
        # Add DT: the token just before the phrase is a determiner -> include it
        if phraseBegin > 0 and tokens[phraseBegin-1].get("POS") == "DT":
            newPhraseOffset = (Range.charOffsetToSingleTuple(tokens[phraseBegin-1].get("charOffset"))[0], phraseOffset[1])
            newPhrase = makePhrase("DT(+)-" + phrase.get("type"),
                      newPhraseOffset, 
                      phraseBegin - 1, 
                      phraseEnd)
            if newPhraseOffset not in phraseDict:
                newPhrases.append(newPhrase)
                phraseDict[newPhraseOffset] = [newPhrase]
    return newPhrases
Exemplo n.º 4
0
def fixAltOffsets(input, output=None):
    """
    Convert every entity's altOffset from document-level to sentence-level
    coordinates by subtracting its sentence's start offset.

    input:  corpus XML (whatever ETUtils.ETFromObj accepts)
    output: optional path to write the modified corpus to
    Returns the corpus ElementTree, modified in place.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    docCount = 0  # NOTE(review): unused
    sentencesCreated = 0  # NOTE(review): unused
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue  # no alternative offset to fix for this entity
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i] 
                # shift from document coordinates to sentence coordinates
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
        
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Exemplo n.º 5
0
def makeINSubPhrases(phrases, tokens, phraseDict, filter=None):
    """
    Build "<type>-IN" phrase variants truncated before a preposition.

    For each phrase (optionally restricted to the types in *filter*), every
    IN-tagged token inside the phrase (except at the very start) produces a
    candidate phrase ending at the token before the preposition. New phrases
    are appended to the returned list and registered in *phraseDict*
    (offset -> [phrase]) when their offset is not already known.
    """
    newPhrases = []
    for phrase in phrases:
        if filter is not None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        prevToken = None
        tokCount = 0
        for token in tokens[phraseBegin:phraseEnd + 1]:
            # A preposition at the first token has nothing before it to keep
            if token.get("POS") == "IN" and prevToken is not None:
                newPhraseOffset = (phraseOffset[0],
                                   Range.charOffsetToSingleTuple(
                                       prevToken.get("charOffset"))[-1])
                newPhrase = makePhrase(
                    phrase.get("type") + "-IN", newPhraseOffset, phraseBegin,
                    phraseBegin + tokCount - 1)
                if newPhraseOffset not in phraseDict:  # 'in' replaces deprecated has_key
                    newPhrases.append(newPhrase)
                    phraseDict[newPhraseOffset] = [newPhrase]
            prevToken = token
            tokCount += 1
    return newPhrases
Exemplo n.º 6
0
def getHeads(corpus):
    """
    Count head-word frequencies per entity type across the corpus.

    Returns {entity type -> {head text -> count}}; the special key "None"
    counts tokens that are not the head of any entity.

    NOTE(review): this function references 'sentenceText' and 'tokens',
    neither of which is defined in this scope — as written, the else-branch
    and the final token loop raise NameError. Presumably 'sentenceText'
    should be the entity text (eText, given the entity-relative slice) and
    'tokens' the sentence's token elements; confirm against the caller.
    """
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}  # bucket for tokens heading no entity
    for sentence in corpus.getiterator("sentence"):
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if not headDict.has_key(eType):
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if headOffset == charOffset:
                # Head spans the whole entity: count the full entity text
                if not headDict[eType].has_key(eText): headDict[eType][eText] = 0
                headDict[eType][eText] += 1
            else:
                # Head is a sub-span: slice it out using entity-relative offsets
                headText = sentenceText[headOffset[0]-charOffset[0]:headOffset[1]-charOffset[0]+1]
                if not headDict[eType].has_key(headText): headDict[eType][headText] = 0
                headDict[eType][headText] += 1
        for token in tokens:
            if not token.get("charOffset") in headOffsetStrings: # token is not the head of any entity
                headText = token.get("text")
                if not headDict["None"].has_key(headText): headDict["None"][headText] = 0
                headDict["None"][headText] += 1
                
    return headDict
Exemplo n.º 7
0
def selectBestMatch(entity, phrases):
    """Return the phrase whose span best matches the entity's span.

    The entity's altOffset is preferred over its charOffset when present.
    Returns None when *phrases* is empty.
    """
    entOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    altOffsetString = entity.get("altOffset")
    if altOffsetString is not None:
        entOffset = Range.charOffsetToSingleTuple(altOffsetString)
    bestScore = sys.maxint
    bestPhrase = None
    for candidate in phrases:
        candidateOffset = Range.charOffsetToSingleTuple(candidate.get("charOffset"))
        score = Range.mismatch(entOffset, candidateOffset)
        if score < bestScore:
            bestScore = score
            bestPhrase = candidate
    return bestPhrase
Exemplo n.º 8
0
def selectBestMatch(entity, phrases):
    """Pick the phrase minimizing Range.mismatch against the entity span.

    altOffset overrides charOffset when available; returns None if no
    phrases are given.
    """
    target = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    if entity.get("altOffset") is not None:
        target = Range.charOffsetToSingleTuple(entity.get("altOffset"))
    winner = (sys.maxint, None)
    for p in phrases:
        d = Range.mismatch(target,
                           Range.charOffsetToSingleTuple(p.get("charOffset")))
        if d < winner[0]:
            winner = (d, p)
    return winner[1]
Exemplo n.º 9
0
def getNECounts(phrases, entities):
    """Count, for each phrase, how many named entities its span contains."""
    counts = {}
    for phrase in phrases:
        span = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        hits = 0
        for candidate in entities:
            if candidate.get("isName") != "True": # only check names
                continue
            candidateSpan = Range.charOffsetToSingleTuple(candidate.get("charOffset"))
            if Range.contains(span, candidateSpan):
                hits += 1
        counts[phrase] = hits
    return counts
Exemplo n.º 10
0
 def getTokens(self, entity, tokenTuples):
     """
     Collect the texts of tokens overlapping the entity's character span.

     tokenTuples are (offsetTuple, tokenElement) pairs in offset order;
     iteration stops at the first non-overlapping token after a match,
     since the span is contiguous.
     """
     charOffsetString = entity.get("charOffset")
     assert charOffsetString != None
     span = Range.charOffsetToSingleTuple(charOffsetString)
     texts = []
     for tokSpan, tokElement in tokenTuples:
         if Range.overlap(span, tokSpan):
             texts.append(tokElement.get("text"))
         elif texts: # past the entity span, nothing more can match
             break
     return texts
Exemplo n.º 11
0
 def getTokens(self, entity, tokenTuples):
     """Return the texts of all tokens whose span overlaps this entity's span."""
     rawOffset = entity.get("charOffset")
     assert rawOffset != None
     entitySpan = Range.charOffsetToSingleTuple(rawOffset)
     collected = []
     for pair in tokenTuples:
         if Range.overlap(entitySpan, pair[0]):
             collected.append(pair[1].get("text"))
         elif len(collected) > 0:
             # tokens come in offset order; once past the span we can stop
             break
     return collected
Exemplo n.º 12
0
def moveElements(document):
    """
    Move document-level entity and interaction elements into the sentence
    elements that contain them, rewriting ids and offsets accordingly.

    Entities get sentence-scoped ids and charOffsets relative to their
    sentence (the original offset is kept in "origOffset"); each interaction
    is attached to the earlier of its two argument sentences and its e1/e2
    references are remapped to the new entity ids.
    """
    entMap = {}            # old entity id -> new sentence-scoped id
    entSentence = {}       # old entity id -> sentence element it moved into
    entSentenceIndex = {}  # old entity id -> index of that sentence
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(
            sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffset = Range.charOffsetToSingleTuple(
                entity.get("charOffset"))
            if Range.overlap(sentenceOffset, entityOffset):
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    # Id already carries an entity suffix; just re-scope it
                    entity.set("id",
                               sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get(
                        "id") + "." + entityIdLastPart
                else:
                    # No entity suffix: keep the old id as docId and number it
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(
                        entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                # Re-base the offset so it is relative to the sentence start
                newEntityOffset = (entityOffset[0] - sentenceOffset[0],
                                   entityOffset[1] - sentenceOffset[0])
                entity.set("origOffset", entity.get("charOffset"))
                entity.set(
                    "charOffset",
                    str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entCount += 1
        sentenceCount += 1
    # Move interactions
    intCount = 0
    for interaction in document.findall("interaction"):
        # Attach to the sentence of whichever argument appears first
        if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[
                interaction.get("e2")]:
            targetSentence = entSentence[interaction.get("e1")]
        else:
            targetSentence = entSentence[interaction.get("e2")]
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
Exemplo n.º 13
0
def fixEntities(xml):
    counts = defaultdict(int)
    for sentence in xml.getiterator("sentence"):
        sText = sentence.get("text")
        for entity in sentence.findall("entity"):
            charOffset = entity.get("charOffset")
            if charOffset == "-":
                assert False, str(entity)
                sentence.remove(entity)
                counts["removed-invalid"] += 1
            else:
                charOffset = Range.charOffsetToSingleTuple(charOffset)
                # fix length
                realLength = len(entity.get("text"))
                lenDiff = (charOffset[1] - charOffset[0] + 1) - realLength
                if lenDiff != realLength:
                    counts["incorrect-ent-offset"] += 1
                    counts["incorrect-ent-offset-diff"+str(lenDiff)] += 1
                    if abs(lenDiff) > 2:
                        print "Warning, lenDiff:", (lenDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                charOffset = (charOffset[0], charOffset[0] + realLength-1)
                # find starting position
                entIndex = sText.find(entity.get("text"), charOffset[0])
                if entIndex == -1:
                    for i in [-1,-2,-3]:
                        entIndex = sText.find(entity.get("text"), charOffset[0]+i)
                        if entIndex != -1:
                            break
                if entIndex != 0: # could be lowercase
                    sTextLower = sText.lower()
                    for i in [0,-1,-2,-3]:
                        lowerEntIndex = sTextLower.find(entity.get("text"), charOffset[0]+i)
                        if lowerEntIndex != -1:
                            break
                    if lowerEntIndex != -1 and abs(lowerEntIndex - charOffset[0]) < abs(entIndex - charOffset[0]):
                        entIndex = lowerEntIndex
                assert entIndex != -1, (charOffset, sText, entity.get("text"), entity.get("id"))
                indexDiff = entIndex - charOffset[0]
                if indexDiff != 0:
                    counts["incorrect-ent-index"] += 1
                    counts["incorrect-ent-index-diff"+str(indexDiff)] += 1
                    print "Warning, indexDiff:", (indexDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                # move offset       
                charOffset = (charOffset[0]+indexDiff, charOffset[1]+indexDiff)
                # validate new offset
                sEntity = sText[charOffset[0]:charOffset[1]+1]
                assert sEntity == entity.get("text") or sEntity.lower() == entity.get("text"), (charOffset, sText, entity.get("text"), entity.get("id"))
                entity.set("charOffset", Range.tuplesToCharOffset( (charOffset[0], charOffset[1])))
                entity.set("isName", "True")
        for interaction in sentence.findall("interaction"):
            interaction.set("type", "DDI")
    print "Fix counts:", counts
Exemplo n.º 14
0
 def getRelativePosition(self, entity1Range, entity2Range, token):
     """
     Classify a token's position relative to a pair of entity spans:
     "Entity1"/"Entity2" when overlapping an entity, otherwise "Fore"
     (before both), "After" (after both) or "Between".
     """
     tokenSpan = Range.charOffsetToSingleTuple(token.get("charOffset"))
     if Range.overlap(entity1Range, tokenSpan):
         return "Entity1"
     if Range.overlap(entity2Range, tokenSpan):
         return "Entity2"
     combinedStart = min(entity1Range[0], entity2Range[0])
     combinedEnd = max(entity1Range[1], entity2Range[1])
     if tokenSpan[1] < combinedStart:
         return "Fore"
     if tokenSpan[1] > combinedEnd:
         return "After"
     return "Between"
Exemplo n.º 15
0
def getNECounts(phrases, entities):
    """Map each phrase to the number of named entities its span contains."""
    counts = {}
    for phrase in phrases:
        span = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        total = 0
        for ent in entities:
            # Only entities explicitly flagged as names participate
            if ent.get("isName") == "True" and Range.contains(
                    span, Range.charOffsetToSingleTuple(ent.get("charOffset"))):
                total += 1
        counts[phrase] = total
    return counts
Exemplo n.º 16
0
 def getRelativePosition(self, entity1Range, entity2Range, token):
     """Return where *token* lies relative to the two entity spans."""
     span = Range.charOffsetToSingleTuple(token.get("charOffset"))
     for entityRange, label in ((entity1Range, "Entity1"),
                                (entity2Range, "Entity2")):
         if Range.overlap(entityRange, span):
             return label
     merged = (min(entity1Range[0], entity2Range[0]),
               max(entity1Range[1], entity2Range[1]))
     if span[1] < merged[0]:
         return "Fore"
     elif span[1] > merged[1]:
         return "After"
     return "Between"
Exemplo n.º 17
0
def getMatchingPhrases(entity, phraseOffsets, phraseDict):
    """
    Return the phrases whose offsets lie between the entity's minimal and
    maximal spans (altOffset, when present, acts as the inner bound).
    Named entities never match.
    """
    if entity.get("isName") == "True":
        return []
    outer = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    innerString = entity.get("altOffset")
    if innerString is None:
        inner = outer
    else:
        inner = Range.charOffsetToSingleTuple(innerString)
    matches = []
    for offset in phraseOffsets:
        if Range.contains(outer, offset) and Range.contains(offset, inner):
            matches.extend(phraseDict[offset])
    return matches
Exemplo n.º 18
0
def getMatchingPhrases(entity, phraseOffsets, phraseDict):
    """Collect phrases sandwiched between the entity's alt and full spans."""
    result = []
    if entity.get("isName") == "True":
        return result
    fullSpan = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    altString = entity.get("altOffset")
    altSpan = Range.charOffsetToSingleTuple(altString) if altString is not None else fullSpan
    for candidate in phraseOffsets:
        inOuter = Range.contains(fullSpan, candidate)
        if inOuter and Range.contains(candidate, altSpan):
            result.extend(phraseDict[candidate])
    return result
Exemplo n.º 19
0
    def get_bins(self, bin_dictionary, column_name):
        """
        Compute value bins for each column in *column_name*.

        bin_dictionary: expected to hold exactly one entry,
            {parent column name: list of [value, class] pairs}
            (Python 2 is assumed: .keys()/.values() return indexable lists).
        column_name: list of column names to bin.

        For every requested column, rows of the domain data matching a
        look-up pair are collected, sorted, split at breaks produced by
        Range.main/Superrange.main, and stored as {column name: bins} in
        the returned dict.

        NOTE(review): 'independent_var' is a module-level name not visible
        here — presumably the ordered list of column names; confirm.
        """
        return_bins = {}
        master_list = []
        key = bin_dictionary.keys()
        lst_dom = self.read_data()
        index_parent = independent_var.index(key[0])
        indexes_of_columns = []
        for c in column_name:
            index_col = independent_var.index(c)
            indexes_of_columns.append(index_col)
        list_to_look_up = bin_dictionary.values()[0]

        for index in indexes_of_columns:
            list_to_return = []
            for lst in list_to_look_up:
                for lst_d in lst_dom:
                    # Row must contain both look-up values, with the first in
                    # the parent column and the second in column 3
                    if lst[0] in lst_d and lst[1] in lst_d:
                        if lst[0] == lst_d[index_parent] and lst[1] == lst_d[3]:
                            list_to_return.append([lst_d[index], lst_d[3]])
            master_list.append(deepcopy(list_to_return))
        sorted_master_list = []
        for k in range(len(master_list)):
            sorted_l = sorted(master_list[k])
            r = Range.main(sorted_l, self.first_value)
            breaks = Superrange.main(r, self.second_value)
            len_breaks = len(breaks)  # NOTE(review): unused
            bins = self.return_splitted_list(sorted_l, breaks)
            bin_dict = {column_name[k]: bins}
            return_bins.update(bin_dict)
            sorted_master_list.append(deepcopy(sorted_l))

        return (return_bins)
Exemplo n.º 20
0
    def interactionWordToElement(self, sentenceId, entitiesById):
        """
        Create an "entity" XML element for this interaction word and register
        it in *entitiesById* under this object's id.

        The element is typed from self.type, flagged isName="False", its
        charOffset built from self.clueTypeCharOffsets and its id from the
        sentence id plus the current entity count. Nothing is registered
        when the resulting character offset is empty.
        """
        interactionWordElement = None
        # Disabled: reuse of an existing entity with a matching charOffset
        #        for entity in entitiesById.values():
        #            if entity.attrib["charOffset"] == Range.tuplesToCharOffset(self.clueTypeCharOffsets):
        #                interactionWordElement = entity
        #                interactionWordElement.attrib["type"] = self.type
        #                interactionWordElement.attrib["isName"] = "False"
        #                break
        if interactionWordElement == None:
            interactionWordElement = ET.Element("entity")
            interactionWordElement.attrib["origId"] = self.id
            interactionWordElement.attrib["type"] = self.type
            interactionWordElement.attrib["isName"] = "False"
            interactionWordElement.attrib[
                "charOffset"] = Range.tuplesToCharOffset(
                    self.clueTypeCharOffsets)
            if interactionWordElement.attrib["charOffset"] == "":
                return  # no offset -> nothing to register
#            if self.headToken != None:
#                interactionWordElement.attrib["headOffset"] = Range.tuplesToCharOffset(self.headToken.charOffset)
#            else:
#                interactionWordElement.attrib["headOffset"] = interactionWordElement.attrib["charOffset"]
            interactionWordElement.attrib["text"] = str(self.clueTypeTexts)
            interactionWordElement.attrib["id"] = sentenceId + ".e" + str(
                len(entitiesById))
            entitiesById[self.id] = interactionWordElement
Exemplo n.º 21
0
 def prepareTokens(self, tokens):
     """Pair each token with its parsed character-offset tuple.

     Returns a list of (offsetTuple, tokenElement) pairs in input order.
     """
     prepared = []
     for tok in tokens:
         span = Range.charOffsetToSingleTuple(tok.get("charOffset"))
         prepared.append((span, tok))
     return prepared
Exemplo n.º 22
0
def writeEventTriggers(document, inputCorpus, outputFile, events, triggerIds, task=1, strengths=False):
    """
    Write a document's event trigger entities in shared-task format
    ("T<n><TAB><type> start end<TAB>text") and record the assigned ids.

    Only non-name entities whose id appears among the event/site id stems
    are written. Duplicate (offset, type) pairs are written once and share
    one trigger id. When *strengths* is a writable file, per-trigger
    prediction strengths are written to it as well. Returns the updated
    triggerIds map ({entity id -> trigger id}).
    """
    entityIndex = 0
    # Find new entity index
    for sentenceElement in document.findall("sentence"):
        sentence = inputCorpus.sentencesById[sentenceElement.get("id")]
        entityIndex = getEntityIndex(sentence.entities, entityIndex, task)
    
    # Collect id stems of all entities acting as event triggers or sites
    eventIdStems = set()
    for key in events.keys():
        for interaction in events[key]:
            site = interaction[1]
            if site != None:
                eventIdStems.add(site.get("e1"))
        if key.find("comb") != -1:
            eventIdStems.add(key.rsplit(".",1)[0])
        else:
            eventIdStems.add(key)
    # Write entities
    offsetMap = {}  # "<offset>_<type>" -> trigger id already assigned
    entityIndex += 1
    for sentenceElement in document.findall("sentence"):
        sentence = inputCorpus.sentencesById[sentenceElement.get("id")]
        sentenceOffset = Range.charOffsetToSingleTuple(sentenceElement.get("charOffset"))
        for entity in sentence.entities:
            if entity.get("isName") == "False":
                if entity.get("id") in eventIdStems:
                    entityOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
                    newOffset = getGeniaOffset(sentenceOffset, entityOffset)
                    match = Range.tuplesToCharOffset(newOffset) + "_" + entity.get("type")
                    if match in offsetMap.keys():
                        # Same span and type already written: reuse its id
                        #assert(not triggerIds.has_key(entity.get("id")))
                        if triggerIds.has_key(entity.get("id")):
                            print >> sys.stderr, "Warning: Duplicate entity (trigger)", entity.get("id"), entity.get("type")
                        triggerIds[entity.get("id")] = offsetMap[match]
                    else:
                        triggerId = "T" + str(entityIndex)
                        strengthLine = ""
                        if strengths != None and entity.get("predictions") != None:
                            #strengthLine = " # " + entity.get("predictions")
                            strengths.write ( encode(triggerId + "\t" + entity.get("predictions") + "\n") )
                        outputFile.write( encode(triggerId + "\t" + entity.get("type") + " " + str(newOffset[0]) + " " + str(newOffset[1]) + "\t" + entity.get("text") + strengthLine + "\n") )
                        offsetMap[match] = triggerId
                        assert(not triggerIds.has_key(entity.get("id")))
                        triggerIds[entity.get("id")] = triggerId
                        entityIndex += 1
    return triggerIds
Exemplo n.º 23
0
def getMatchingPhrases(entity, phraseOffsets, phraseDict):
    """
    Return phrases bounded by the entity's span. The inner bound is the
    altOffset when present; for certain entity types it falls back to the
    headOffset instead of the full span. Named entities never match.
    """
    if entity.get("isName") == "True":
        return []
    outerSpan = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    innerString = entity.get("altOffset")
    if innerString is not None:
        innerSpan = Range.charOffsetToSingleTuple(innerString)
    elif entity.get("type") in ["Host", "HostPart", "Geographical", "Environmental", "Food", "Medical", "Soil", "Water"]:
        # These types use the head token's span as the inner bound
        innerSpan = Range.charOffsetToSingleTuple(entity.get("headOffset"))
    else:
        innerSpan = outerSpan
    found = []
    for candidate in phraseOffsets:
        if Range.contains(outerSpan, candidate) and Range.contains(candidate, innerSpan):
            found.extend(phraseDict[candidate])
    return found
Exemplo n.º 24
0
    def getPatterns(self, e1, e2):
        """
        Collect lowercase token patterns (each token with up to two
        preceding tokens as context) from the sentence, grouped by position
        relative to the entity pair (e1, e2).

        Returns three dicts: patternForeBetween (tokens before or between
        the entities), patternBetween (strictly between) and
        patternBetweenAfter (between or after). Tokens that belong to a
        name are skipped.
        """
        e1Range = Range.charOffsetToSingleTuple(e1.get("charOffset"))
        e2Range = Range.charOffsetToSingleTuple(e2.get("charOffset"))

        # Classify every token as Entity1/Entity2/Fore/Between/After
        tokenPositions = {}
        for token in self.sentenceGraph.tokens:
            tokenPositions[token.get("id")] = self.getRelativePosition(
                e1Range, e2Range, token)

        prevTokenText = None
        prevToken2Text = None
        prevPosition = None
        patternForeBetween = {}
        patternBetween = {}
        patternBetweenAfter = {}
        for token in self.sentenceGraph.tokens:
            if self.sentenceGraph.tokenIsName[token]:
                continue  # skip tokens that are part of a name

            id = token.get("id")
            text = token.get("text").lower()

            # Reset the preceding-token context at region boundaries
            if prevPosition != tokenPositions[id]:
                prevTokenText = None
                prevToken2Text = None

            if tokenPositions[id] == "Fore":
                self.addToPattern(patternForeBetween, text, prevTokenText,
                                  prevToken2Text)
            elif tokenPositions[id] == "Between":
                # "Between" tokens contribute to all three pattern sets
                self.addToPattern(patternForeBetween, text, prevTokenText,
                                  prevToken2Text)
                self.addToPattern(patternBetween, text, prevTokenText,
                                  prevToken2Text)
                self.addToPattern(patternBetweenAfter, text, prevTokenText,
                                  prevToken2Text)
            elif tokenPositions[id] == "After":
                self.addToPattern(patternBetweenAfter, text, prevTokenText,
                                  prevToken2Text)

            prevPosition = tokenPositions[id]
            #if tokenPositions[id].find("Entity") != -1:
            prevToken2Text = prevTokenText
            prevTokenText = text

        return patternForeBetween, patternBetween, patternBetweenAfter
Exemplo n.º 25
0
def getPhraseDict(phrases):
    """
    Index phrases by their parsed character-offset tuple.

    Returns {offsetTuple: [phrase, ...]}, preserving input order within
    each offset group.
    """
    phraseDict = {}
    for phrase in phrases:
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        # setdefault replaces the deprecated has_key check-then-insert pattern
        phraseDict.setdefault(phraseOffset, []).append(phrase)
    return phraseDict
Exemplo n.º 26
0
def makeDETSubPhrases(phrases, tokens, phraseDict, filter=None):
    """
    Generate "DT-<type>" phrase variants extended leftwards to include a
    preceding determiner token.

    Phrases may be restricted to the types in *filter*. Each new phrase
    whose offset is not already present is appended to the returned list
    and registered in *phraseDict* (offset -> [phrase]).
    """
    newPhrases = []
    for phrase in phrases:
        if filter is not None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        # Extend the phrase when the token just before it is a determiner
        if phraseBegin > 0 and tokens[phraseBegin - 1].get("POS") == "DT":
            newPhraseOffset = (Range.charOffsetToSingleTuple(
                tokens[phraseBegin - 1].get("charOffset"))[0], phraseOffset[1])
            newPhrase = makePhrase("DT-" + phrase.get("type"), newPhraseOffset,
                                   phraseBegin - 1, phraseEnd)
            if newPhraseOffset not in phraseDict:  # 'in' replaces deprecated has_key
                newPhrases.append(newPhrase)
                phraseDict[newPhraseOffset] = [newPhrase]
    return newPhrases
Exemplo n.º 27
0
def getPhraseDict(phrases):
    """Group phrases into a dict keyed by their parsed charOffset tuple."""
    byOffset = {}
    for p in phrases:
        key = Range.charOffsetToSingleTuple(p.get("charOffset"))
        if key not in byOffset:
            byOffset[key] = []
        byOffset[key].append(p)
    return byOffset
Exemplo n.º 28
0
def moveElements(document):
    """Move document-level entity and interaction elements into sentences.

    Each entity is attached to the first sentence whose character span
    overlaps it; its charOffset is rebased to sentence-relative
    coordinates (the original is kept in "origOffset") and it receives a
    sentence-scoped id. Each interaction is attached to the sentence of
    whichever of its end entities lies in the earlier sentence, with its
    e1/e2 references remapped to the new entity ids.
    """
    entMap = {}  # old entity id -> new sentence-scoped id
    entSentence = {}  # old entity id -> sentence element it was moved into
    entSentenceIndex = {}  # old entity id -> index of that sentence
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if Range.overlap(sentenceOffset, entityOffset):
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    # Id already carries an entity-style suffix; keep it.
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    # Preserve the document-level id before renumbering.
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                # Rebase the offset so it is relative to the sentence start.
                newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                entity.set("origOffset", entity.get("charOffset"))
                entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entCount += 1
        sentenceCount += 1
    # Move interactions
    intCount = 0
    for interaction in document.findall("interaction"):
        # Target the sentence of whichever end entity appears earlier.
        if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
            targetSentence = entSentence[interaction.get("e1")]
        else:
            targetSentence = entSentence[interaction.get("e2")]
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
Exemplo n.º 29
0
 def calculateE(self):
     """Interpolate the loaded data, compute enthalpy results, and fold
     the thermal effect of every defined range into the final result.

     Shows a warning dialog and aborts if any range's user-supplied
     function fails to evaluate.
     """
     self.dataList = calc.interpolate(self.dataList)
     self.resultList = calc.calculateE(self.dataList[0], self.dataList[1])
     for r in self.rangeList:
         try:
             self.thermalEffectList = Range.InsertThermalEffect(
                 self.thermalEffectList, r)
         # BUGFIX: narrowed from a bare "except:", which also swallowed
         # KeyboardInterrupt/SystemExit.
         except Exception:
             self.showWarning("Błąd obliczeń",\
                 "Nieprawidłowa funkcja",\
                 "Uwaga",\
                 "Funkcja podana w przemianie: {} jest nieprawidłowa".format(r.name))
             return
         print("Min: {}\nMax: {}".format(r.start, r.end))
     print(self.thermalEffectList)
     self.resultList = Range.calculateFinalEntalphy(self.dataList[0],
                                                    self.resultList,
                                                    self.thermalEffectList)
     self.thermalEffectList = []
Exemplo n.º 30
0
    def __init__(self):
        """Set up the main window: load the UI file, seed the initial
        data range, and wire every widget signal to its handler."""
        #WINDOW INIT------------------------------------------------------
        QMainWindow.__init__(self)
        loadUi("resources/testv2.ui", self)

        self.setWindowTitle("Kalkulator Entalpii")

        #BACK------------------------------------
        # Load the data file first; the initial Range is built from it.
        self.loadFile()
        self.rangeList, i = Range.InsertNewRange(self.rangeList,Range.Range(self.min,self.max,self.dataList[0][0],\
            self.dataList[0][1], "x", 20, "{} - Przemiana".format(self.rangeComboBox.count()+1)))
        print(len(self.rangeList))

        #FRONT-----------------------------------
        self.rangeParametersEnabled(True)

        # Range navigation buttons.
        self.nextButton.clicked.connect(self.nextButtonClicked)
        self.previousButton.clicked.connect(self.previousButtonClicked)
        self.firstButton.clicked.connect(self.firstButtonClicked)
        self.lastButton.clicked.connect(self.lastButtonClicked)

        # Range creation/removal.
        self.newButton.clicked.connect(self.newButtonClicked)
        self.removeButton.clicked.connect(self.removeButtonClicked)

        self.rangeComboBox.currentIndexChanged.connect(
            self.onRangeComboboxChanged)

        self.methodComboBox.currentIndexChanged.connect(
            self.onMethodComboboxChanged)

        self.saveButton.clicked.connect(self.saveButtonClicked)
        self.drawButton.clicked.connect(self.drawButtonClicked)

        self.savePlotButton.clicked.connect(self.savePlot)

        # File load/save menu actions.
        self.actionLoad.triggered.connect(self.loadFileDialogBox)
        self.actionSaveResult.triggered.connect(self.saveFileDialogBox)
        self.actionSaveResultAndData.triggered.connect(
            self.saveBigFileDialogBox)
        self.addToolBar(NavigationToolbar(self.MplWidget.canvas, self))

        self.addRangeToComboBox(0)
Exemplo n.º 31
0
def makeTokenSubPhrases(tokens, phraseDict, includePOS=["NN", "PRP$", "IN", "WP$"]):
    """Add single-token phrases for tokens of selected POS classes.

    A "TOK-t<POS>" phrase is created for every token whose POS tag is in
    includePOS (None means all tokens) and whose offset is not already in
    phraseDict. New phrases are registered in phraseDict and returned.
    NOTE: the mutable default list is never modified, so it is safe here.
    """
    newPhrases = []
    for i, token in enumerate(tokens):
        tokPOS = token.get("POS")
        if includePOS is None or tokPOS in includePOS:
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if tokOffset not in phraseDict:  # was Python 2-only has_key()
                newPhrase = makePhrase("TOK-t" + tokPOS, tokOffset, i, i)
                newPhrases.append(newPhrase)
                phraseDict[tokOffset] = [newPhrase]
    return newPhrases
Exemplo n.º 32
0
def makeTokenSubPhrases(tokens, phraseDict, includePOS=["PRP$", "IN", "WP$"]):
    """Add a single-token phrase for every token whose POS is listed.

    New "TOK-t<POS>" phrases are registered in phraseDict (offsets that
    are already present are skipped) and returned.
    NOTE: the mutable default list is never modified, so it is safe here.
    """
    newPhrases = []
    for i, token in enumerate(tokens):
        tokPOS = token.get("POS")
        if tokPOS in includePOS:
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if tokOffset not in phraseDict:  # was Python 2-only has_key()
                newPhrase = makePhrase("TOK-t" + tokPOS, tokOffset, i, i)
                newPhrases.append(newPhrase)
                phraseDict[tokOffset] = [newPhrase]
    return newPhrases
Exemplo n.º 33
0
 def getPatterns(self, e1, e2):
     """Collect lowercased token n-gram patterns relative to the entity
     pair (e1, e2).

     Tokens are classified by self.getRelativePosition into Fore /
     Between / After zones; named-entity tokens are skipped, and the
     bigram/trigram context resets whenever the zone changes. Returns
     the (fore+between, between, between+after) pattern dicts.
     """
     e1Range = Range.charOffsetToSingleTuple(e1.get("charOffset"))
     e2Range = Range.charOffsetToSingleTuple(e2.get("charOffset"))

     tokenPositions = {}
     for token in self.sentenceGraph.tokens:
         tokenPositions[token.get("id")] = self.getRelativePosition(e1Range, e2Range, token)

     prevTokenText = None
     prevToken2Text = None
     prevPosition = None
     patternForeBetween = {}
     patternBetween = {}
     patternBetweenAfter = {}
     for token in self.sentenceGraph.tokens:
         # Skip named-entity tokens so patterns cannot memorize names.
         if self.sentenceGraph.tokenIsName[token]:
             continue

         tokenId = token.get("id")  # renamed from "id" (shadowed builtin)
         text = token.get("text").lower()

         # Zone change: restart the n-gram context.
         if prevPosition != tokenPositions[tokenId]:
             prevTokenText = None
             prevToken2Text = None

         if tokenPositions[tokenId] == "Fore":
             self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
         elif tokenPositions[tokenId] == "Between":
             # Between-zone tokens contribute to all three pattern sets.
             self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
             self.addToPattern(patternBetween, text, prevTokenText, prevToken2Text)
             self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
         elif tokenPositions[tokenId] == "After":
             self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)

         prevPosition = tokenPositions[tokenId]
         prevToken2Text = prevTokenText
         prevTokenText = text

     return patternForeBetween, patternBetween, patternBetweenAfter
Exemplo n.º 34
0
def writeProteins(document, inputCorpus, outputFile=None):
    """Write a document's named (protein) entities in GENIA shared-task
    trigger format and return {entity id: trigger id}.

    Lines are emitted in ascending trigger-number order; outputFile=None
    builds the map without writing anything.
    """
    entityMap = {}
    offsetMap = {}
    triggerMap = {}
    for sentenceElement in document.findall("sentence"):
        sentence = inputCorpus.sentencesById[sentenceElement.get("id")]
        sentenceOffset = Range.charOffsetToSingleTuple(sentenceElement.get("charOffset"))
        for entity in sentence.entities:
            if entity.get("isName") == "True":
                # origId looks like "<doc>.<Tn>"; keep only the trigger part.
                origId = entity.get("origId").split(".")[-1]
                origIdNumber = int(origId[1:])
                # Membership test directly on the dict (not .keys()).
                assert origIdNumber not in entityMap, origIdNumber
                entityMap[origIdNumber] = entity

                entityOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
                offsetMap[origIdNumber] = getGeniaOffset(sentenceOffset, entityOffset)
                triggerMap[entity.get("id")] = origId
    for key in sorted(entityMap.keys()):
        entity = entityMap[key]
        if outputFile != None:
            outputFile.write(encode(triggerMap[entity.get("id")] + "\tProtein " + str(offsetMap[key][0]) + " " + str(offsetMap[key][1]) + "\t" + entity.get("text") + "\n"))
    return triggerMap
Exemplo n.º 35
0
def fixIndices(phrases, tokens):
    """Repair each phrase's "begin"/"end" token indices so that they
    match the tokens whose character offsets align with the phrase
    boundaries.

    Returns the number of phrases whose indices were changed (the
    original computed this count but never returned it).
    """
    fixCount = 0
    phraseCount = 0
    for phrase in phrases:
        fixed = False
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        for i in range(len(tokens)):
            token = tokens[i]
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if tokOffset[0] == phraseOffset[0]:
                if phraseBegin != i:
                    phrase.set("begin", str(i))
                    fixed = True
            if tokOffset[1] == phraseOffset[1]:
                if phraseEnd != i:
                    phrase.set("end", str(i))
                    fixed = True
                break  # end boundary found; later tokens are irrelevant
        if fixed:
            fixCount += 1
        phraseCount += 1
    return fixCount
Exemplo n.º 36
0
def fixIndices(phrases, tokens):
    """Align each phrase's "begin"/"end" token indices with the tokens
    whose char offsets match the phrase boundaries.

    Returns how many phrases were changed (the original dropped this
    count instead of returning it).
    """
    fixCount = 0
    phraseCount = 0
    for phrase in phrases:
        fixed = False
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        for i in range(len(tokens)):
            token = tokens[i]
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if tokOffset[0] == phraseOffset[0]:
                if phraseBegin != i:
                    phrase.set("begin", str(i))
                    fixed = True
            if tokOffset[1] == phraseOffset[1]:
                if phraseEnd != i:
                    phrase.set("end", str(i))
                    fixed = True
                break  # nothing after the end token can match
        if fixed:
            fixCount += 1
        phraseCount += 1
    return fixCount
Exemplo n.º 37
0
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    """Select the syntactic head token for an entity.

    Prefers the explicit headOffset attribute, falling back to the
    entity's charOffset. Every token overlapping any offset is a
    candidate; ambiguity is resolved via findHeadToken and
    tokenHeadScores. Raises AssertionError when no head can be chosen.
    """
    if entity.get("headOffset") != None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = []  # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens) == 1:  # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else:  # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
    # BUGFIX: the original referenced undefined "entityElement" here, which
    # raised NameError instead of a useful assertion message.
    assert selectedHeadToken != None, entity.get("id")
    return selectedHeadToken
Exemplo n.º 38
0
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    """Pick the head token for an entity from the overlapping tokens.

    Uses headOffset when present, otherwise charOffset; with several
    candidates the choice is delegated to findHeadToken using
    tokenHeadScores. Raises AssertionError if no head is found.
    """
    if entity.get("headOffset") != None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = []  # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens) == 1:  # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else:  # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
    # BUGFIX: was "entityElement.get(...)" — an undefined name that turned
    # a failed assertion into a NameError.
    assert selectedHeadToken != None, entity.get("id")
    return selectedHeadToken
Exemplo n.º 39
0
def getHeads(corpus):
    """Count head-word strings per entity type over a corpus.

    Returns {entityType: {headText: count}}; the special "None" key
    counts tokens that are not the head of any entity.
    """
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}
    for sentence in corpus.getiterator("sentence"):
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if eType not in headDict:
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(
                entity.get("charOffset"))
            if headOffset == charOffset:
                # Head spans the whole entity; count the entity text itself.
                if eText not in headDict[eType]:
                    headDict[eType][eText] = 0
                headDict[eType][eText] += 1
            else:
                # BUGFIX: the original sliced an undefined "sentenceText".
                # The head offset is absolute while the slice must be
                # entity-relative, so slice the entity's own text instead.
                headText = eText[headOffset[0] - charOffset[0]:
                                 headOffset[1] - charOffset[0] + 1]
                if headText not in headDict[eType]:
                    headDict[eType][headText] = 0
                headDict[eType][headText] += 1
        # BUGFIX: "tokens" was undefined in the original; iterate the token
        # elements below this sentence. NOTE(review): confirm token elements
        # are reachable from the sentence element in this corpus schema.
        for token in sentence.getiterator("token"):
            if token.get("charOffset") not in headOffsetStrings:
                # token is not the head of any entity
                headText = token.get("text")
                if headText not in headDict["None"]:
                    headDict["None"][headText] = 0
                headDict["None"][headText] += 1

    return headDict
Exemplo n.º 40
0
    def writeXMLSentence(self,
                         examples,
                         predictionsByExample,
                         sentenceObject,
                         classSet,
                         classIds,
                         goldSentence=None):
        """Write predicted trigger entities back into a sentence element.

        Pair/interaction children and non-name entities are stripped,
        then one new <entity> is appended per example with its predicted
        class. The sentenceanalyses subtree is detached while editing and
        re-attached afterwards.
        """
        self.assertSameSentence(examples)

        sentenceElement = sentenceObject.sentence
        sentenceId = sentenceElement.get("id")
        sentenceText = sentenceElement.get("text")
        # detach analyses-element
        sentenceAnalysesElement = None
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        if sentenceAnalysesElement != None:
            sentenceElement.remove(sentenceAnalysesElement)
        # remove pairs and interactions
        interactions = self.removeChildren(sentenceElement,
                                           ["pair", "interaction"])
        # remove entities
        newEntityIdCount = IDUtils.getNextFreeId(
            sentenceElement.findall("entity"))
        nonNameEntities = self.removeNonNameEntities(sentenceElement)

        # add new pairs
        for example in examples:
            prediction = predictionsByExample[example[0]]
            entityElement = ET.Element("entity")
            entityElement.attrib["isName"] = "False"
            # example[3]["t"] holds the head token's id; resolve it to the
            # token element itself.
            headToken = example[3]["t"]
            for token in sentenceObject.tokens:
                if token.get("id") == headToken:
                    headToken = token
                    break
            entityElement.set("charOffset", example[3]["charOffset"])
            entityElement.set("headOffset", headToken.get("charOffset"))
            entityElement.set("phraseType", example[3]["ptype"])
            entOffset = Range.charOffsetToSingleTuple(example[3]["charOffset"])
            # Offsets are inclusive, hence the +1 when slicing the text.
            entityElement.set("text",
                              sentenceText[entOffset[0]:entOffset[1] + 1])
            entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
            self.setElementType(entityElement, prediction, classSet, classIds)
            newEntityIdCount += 1
            sentenceElement.append(entityElement)

        # re-attach the analyses-element
        if sentenceAnalysesElement != None:
            sentenceElement.append(sentenceAnalysesElement)
Exemplo n.º 41
0
def makeINSubPhrases(phrases, tokens, phraseDict, filter=None):
    """For phrases containing an "IN" (preposition) token, add the
    sub-phrase ending just before the preposition.

    New phrases are typed "<origType>-IN", registered in phraseDict and
    returned; offsets already present in phraseDict are skipped. filter,
    when given, limits processing to the listed phrase types.
    """
    newPhrases = []
    for phrase in phrases:
        if filter is not None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        prevToken = None
        tokCount = 0
        for token in tokens[phraseBegin:phraseEnd + 1]:
            # Cut before an IN token (but never at the phrase start).
            if token.get("POS") == "IN" and prevToken is not None:
                newPhraseOffset = (phraseOffset[0], Range.charOffsetToSingleTuple(prevToken.get("charOffset"))[-1])
                newPhrase = makePhrase(phrase.get("type") + "-IN",
                                       newPhraseOffset,
                                       phraseBegin,
                                       phraseBegin + tokCount - 1)
                if newPhraseOffset not in phraseDict:  # avoid duplicates
                    newPhrases.append(newPhrase)
                    phraseDict[newPhraseOffset] = [newPhrase]
            prevToken = token
            tokCount += 1
    return newPhrases
Exemplo n.º 42
0
def buildEntityElement(interactionGraphNode, sentenceText):
    """Create an <entity> element for the given interaction-graph node.

    Returns an (element, startPos) pair, startPos being the character
    index where the node's first sub-token begins. The "id" attribute is
    deliberately left as None for the caller to fill in.
    """
    node = interactionGraphNode.entity
    pieces = node.token.getNested()
    firstCharIndex = int(pieces[0].offset_bgn)
    span = getEntityOffset(pieces, sentenceText)
    element = ET.Element("entity")
    element.attrib["charOffset"] = Range.tuplesToCharOffset(span)
    element.attrib["origId"] = node.id
    element.attrib["id"] = None
    element.attrib["type"] = node.type
    element.attrib["isName"] = str(node.isName)
    coveredTexts = getTextByOffsets(span, sentenceText)
    # One covered span -> plain string; otherwise store the list's repr.
    element.attrib["text"] = coveredTexts[0] if len(coveredTexts) == 1 else str(coveredTexts)
    return (element, firstCharIndex)
Exemplo n.º 43
0
 def findHeadTokenSimple(self, charOffsets):
     """Pick a head token by the simplest rule: of all tokens that
     overlap any of the given character offsets, return the one with the
     smallest id (i.e. the leftmost); None when nothing overlaps."""
     overlappingIds = set()
     for span in charOffsets:
         for tokenId in self.tokensById:
             token = self.tokensById[tokenId]
             if Range.overlap(span, token.charOffset):
                 overlappingIds.add(token.id)
     if not overlappingIds:
         return None
     # min() is equivalent to sorting and taking the first element.
     return self.tokensById[min(overlappingIds)]
def buildEntityElement(interactionGraphNode, sentenceText):
    """Build an <entity> element for an interaction-graph node.

    Returns (entityElement, startPos) where startPos is the character
    index of the node's first sub-token; the "id" attribute is left as
    None for the caller to assign.
    """
    entityElement = ET.Element("entity")
    subTokens = interactionGraphNode.entity.token.getNested()
    startPos = int(subTokens[0].offset_bgn)
    offset = getEntityOffset(subTokens, sentenceText)
    entityElement.attrib["charOffset"] = Range.tuplesToCharOffset(offset)
    entityElement.attrib["origId"] = interactionGraphNode.entity.id
    entityElement.attrib["id"] = None
    entityElement.attrib["type"] = interactionGraphNode.entity.type
    entityElement.attrib["isName"] = str(interactionGraphNode.entity.isName)
    texts = getTextByOffsets(offset, sentenceText)
    # A single covered text becomes a plain string; discontinuous spans
    # are stored as the string form of the list.
    if len(texts) == 1:
        entityElement.attrib["text"] = texts[0]
    else:
        entityElement.attrib["text"] = str(texts)
    return (entityElement, startPos)
Exemplo n.º 45
0
 def findHeadTokenSimple(self, charOffsets):
     """Pick the head token for the given character offsets.

     Collects every token overlapping any offset and returns the one
     with the smallest id (the leftmost); None when no token overlaps.
     """
     # Takes always leftmost token
     tokenKeys = self.tokensById.keys()
     tokenKeys.sort()
     candidateTokens = set()
     for charOffset in charOffsets:
         for key in tokenKeys:
             token = self.tokensById[key]
             if Range.overlap(charOffset, token.charOffset):
                 candidateTokens.add(token.id)
     if len(candidateTokens)==0:
         return None
     else:
         # Sorting the ids and taking the first yields the leftmost token.
         candidateTokens = list(candidateTokens)
         candidateTokens.sort()
         return self.tokensById[candidateTokens[0]]
Exemplo n.º 46
0
 def markBioInferInteractions(self, interactions):
     """Mark tokens covered by a BioInfer interaction.

     interaction[3] holds comma-separated "from-to" character spans and
     interaction[4] the interaction word; every token overlapping one of
     the spans gets the word appended to its .interactionWords list.
     Returns the ids of the tokens that were marked.
     """
     marked = []
     for interaction in interactions:
         spans = []
         for spanText in interaction[3].split(","):
             begin, end = spanText.split("-")
             spans.append((int(begin), int(end)))
         for tokenId, token in self.tokensById.iteritems():
             for span in spans:
                 if Range.overlap(span, token.charOffset):
                     token.interactionWords.append(interaction[4])
                     marked.append(token.id)
     return marked
Exemplo n.º 47
0
def removeNamedEntityPhrases(entities, phrases, phraseDict):
    """Drop phrases whose charOffset string equals a named entity's.

    Matching phrases are also removed from phraseDict (keyed by offset
    tuple); the surviving phrases are returned in their original order.
    """
    nameOffsets = set(e.get("charOffset") for e in entities
                      if e.get("isName") == "True")
    kept = []
    for phrase in phrases:
        offsetStr = phrase.get("charOffset")
        if offsetStr not in nameOffsets:
            kept.append(phrase)
            continue
        offsetTuple = Range.charOffsetToSingleTuple(offsetStr)
        if offsetTuple in phraseDict:
            del phraseDict[offsetTuple]
    return kept
Exemplo n.º 48
0
 def markNamedEntities(self, entityElements):
     """ Marks tokens belonging to named entities

     For every entity element the (possibly comma-separated) "from-to"
     spans in its charOffset attribute are parsed, and the entity id is
     appended to the .entities list of each overlapping token in
     self.tokensById. Returns the ids of all tokens marked this way.
     """
     namedEntityTokens = []
     for entityElement in entityElements:
         offsets = []
         # charOffset may list several "from-to" spans separated by commas.
         offsetStrings = entityElement.attrib["charOffset"].split(",")
         for offsetString in offsetStrings:
             charFrom, charTo = offsetString.split("-")
             offset = (int(charFrom), int(charTo))
             offsets.append(offset)
         for k, v in self.tokensById.iteritems():
             for offset in offsets:
                 if Range.overlap(offset, v.charOffset):
                     v.entities.append(entityElement.attrib["id"])
                     namedEntityTokens.append(v.id)
     return namedEntityTokens
Exemplo n.º 49
0
 def markNamedEntities(self, entityElements):
     """Mark tokens covered by named entities.

     Parses each entity element's comma-separated "from-to" charOffset
     spans, appends the entity id to every overlapping token's .entities
     list, and returns the ids of the tokens that were marked.
     """
     markedTokenIds = []
     for element in entityElements:
         spans = []
         for spanText in element.attrib["charOffset"].split(","):
             begin, end = spanText.split("-")
             spans.append((int(begin), int(end)))
         for tokenId, token in self.tokensById.iteritems():
             for span in spans:
                 if Range.overlap(span, token.charOffset):
                     token.entities.append(element.attrib["id"])
                     markedTokenIds.append(token.id)
     return markedTokenIds
Exemplo n.º 50
0
 def markBioInferInteractions(self, interactions):
     """ Marks tokens belonging to a BioInfer interaction

     interaction[3] holds comma-separated "from-to" character spans and
     interaction[4] the interaction word; every overlapping token in
     self.tokensById gets the word appended to .interactionWords.
     Returns the ids of the marked tokens.
     """
     interactionTokens = []
     for interaction in interactions:
         offsets = []
         offsetStrings = interaction[3].split(",")
         for offsetString in offsetStrings:
             charFrom, charTo = offsetString.split("-")
             offset = (int(charFrom), int(charTo))
             offsets.append(offset)
         for k, v in self.tokensById.iteritems():
             for offset in offsets:
                 if Range.overlap(offset, v.charOffset):
                     v.interactionWords.append(interaction[4])
                     interactionTokens.append(v.id)
     return interactionTokens
Exemplo n.º 51
0
def removeNamedEntityPhrases(entities, phrases, phraseDict):
    """Filter out phrases whose charOffset matches a named entity's.

    Removed phrases are also popped from phraseDict (keyed by offset
    tuple); returns the phrases that survive, in input order.
    """
    neOffsets = set()
    for entity in entities:
        if entity.get("isName") == "True":
            neOffsets.add(entity.get("charOffset"))
    keep = []
    for phrase in phrases:
        offsetText = phrase.get("charOffset")
        if offsetText in neOffsets:
            # pop() with a default mirrors the original "if in: del".
            phraseDict.pop(Range.charOffsetToSingleTuple(offsetText), None)
        else:
            keep.append(phrase)
    return keep
Exemplo n.º 52
0
    def toElement(self, sentenceId, entitiesById):
        """Convert this entity into an Interaction-XML <entity> element.

        The element is registered in entitiesById under the original id
        (which also determines the ".eN" numbering) and stored on
        self.InteractionXMLelement.
        """
        element = ET.Element("entity")
        element.attrib["origId"] = self.id
        # Number the new id before registering, so the count is pre-insert.
        element.attrib["id"] = sentenceId + ".e" + str(len(entitiesById))
        entitiesById[self.id] = element
        element.attrib["type"] = self.sem
        # Only gene/gene-product entities count as named entities.
        element.attrib["isName"] = "True" if self.sem == "gene-or-gene-product" else "False"
        element.attrib["charOffset"] = Range.tuplesToCharOffset(self.charOffset)
        element.attrib["text"] = self.text
        self.InteractionXMLelement = element
Exemplo n.º 53
0
    def main(self):
        """Bin each independent column against the dependent column and
        build a decision tree from the per-bin variances."""
        independent_col = [1, 2, 5]
        dep_col = 8
        variance_of_bins = []
        all_bins = []
        for i in independent_col:
            # BUGFIX: use dep_col instead of repeating the literal 8, so
            # the dependent column is defined in exactly one place.
            content = self.take_independent_dependent_columns([i, dep_col])
            sorted_content = sorted(content)
            r = Range.main(sorted_content, self.first_value)
            breaks = Superrange.main(r, self.second_value)
            bins = self.return_splitted_list(content, breaks)
            all_bins.append(bins)

        for bin in all_bins:
            var = self.bin_var(bin)
            variance_of_bins.append(var)
        # create_tree is called for its effect; the original discarded the
        # binding ("final_tree") without using it.
        self.create_tree(all_bins, variance_of_bins)
Exemplo n.º 54
0
 def toElement(self, sentenceId, entitiesById):
     """Convert this entity to an Interaction-XML <entity> element,
     register it in entitiesById under the original id, and store it on
     self.InteractionXMLelement."""
     entityElement = ET.Element("entity")
     entityElement.attrib["origId"] = self.id
     entityElement.attrib["id"] = sentenceId + ".e" + str(len(entitiesById))
     entitiesById[self.id] = entityElement
     entityElement.attrib["type"] = self.sem
     # Only gene-or-gene-product entities are treated as named entities.
     if self.sem == "gene-or-gene-product":
         entityElement.attrib["isName"] = "True"
     else:
         entityElement.attrib["isName"] = "False"
     entityElement.attrib["charOffset"] = Range.tuplesToCharOffset(
         self.charOffset)
     #entityElement.attrib["headOffset"] = entityElement.attrib["charOffset"]
     #        if self.headToken != None:
     #            entityElement.attrib["headOffset"] = Range.tuplesToCharOffset(self.headToken.charOffset)
     #        else:
     #            entityElement.attrib["headOffset"] = interactionWordElement.attrib["charOffset"]
     entityElement.attrib["text"] = self.text
     self.InteractionXMLelement = entityElement
Exemplo n.º 55
0
    def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None):
        """Write predicted trigger entities back into a sentence element.

        Pair/interaction children and non-name entities are removed,
        then one new <entity> is appended per example with its predicted
        class. The sentenceanalyses subtree is detached while editing and
        re-attached afterwards.
        """
        self.assertSameSentence(examples)

        sentenceElement = sentenceObject.sentence
        sentenceId = sentenceElement.get("id")
        sentenceText = sentenceElement.get("text")
        # detach analyses-element
        sentenceAnalysesElement = None
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        if sentenceAnalysesElement != None:
            sentenceElement.remove(sentenceAnalysesElement)
        # remove pairs and interactions
        interactions = self.removeChildren(sentenceElement, ["pair", "interaction"])
        # remove entities
        newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
        nonNameEntities = self.removeNonNameEntities(sentenceElement)

        # add new pairs
        for example in examples:
            prediction = predictionsByExample[example[0]]
            entityElement = ET.Element("entity")
            entityElement.attrib["isName"] = "False"
            # example[3]["t"] holds the head token's id; resolve it to the
            # token element itself.
            headToken = example[3]["t"]
            for token in sentenceObject.tokens:
                if token.get("id") == headToken:
                    headToken = token
                    break
            entityElement.set("charOffset", example[3]["charOffset"])
            entityElement.set("headOffset", headToken.get("charOffset"))
            entityElement.set("phraseType", example[3]["ptype"])
            entOffset = Range.charOffsetToSingleTuple(example[3]["charOffset"])
            # Offsets are inclusive, hence the +1 when slicing the text.
            entityElement.set("text", sentenceText[entOffset[0]:entOffset[1]+1])
            entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
            self.setElementType(entityElement, prediction, classSet, classIds)
            newEntityIdCount += 1
            sentenceElement.append(entityElement)

        # re-attach the analyses-element
        if sentenceAnalysesElement != None:
            sentenceElement.append(sentenceAnalysesElement)
Exemplo n.º 56
0
 def findHeadToken(self, charOffsets):
     debug = False
     tokenKeys = self.tokensById.keys()
     tokenKeys.sort()
     candidateTokenIds = set()
     for charOffset in charOffsets:
         for key in tokenKeys:
             token = self.tokensById[key]
             if Range.overlap(charOffset, token.charOffset):
                 candidateTokenIds.add(token.id)
                 #if token.text == "Leukotriene":
                 #    debug = True
     candidateTokenIds = list(candidateTokenIds)
     candidateTokenIds.sort()
     
     tokenHeadScores = self.scoreTokens()
     
     #if debug:
     #    print "Tokens:", candidateTokenIds
     #    print "Scores:", tokenScores
     
     if len(candidateTokenIds) == 0:
         return None
     
     highestScore = -9999999
     bestTokens = []
     for i in candidateTokenIds:
         if tokenHeadScores[i] > highestScore:
             highestScore = tokenHeadScores[i]
     for i in range(len(candidateTokenIds)):
         if tokenHeadScores[candidateTokenIds[i]] == highestScore:
             bestTokens.append(candidateTokenIds[i])
     if debug:
         print "tokens:"
         for i in range(len(candidateTokenIds)):
             print "[", candidateTokenIds[i], self.tokensById[candidateTokenIds[i]].text, tokenHeadScores[candidateTokenIds[i]], "]"
     return self.tokensById[bestTokens[-1]]
     assert(False)    
Exemplo n.º 57
0
    def interactionWordToElement(self, sentenceId, entitiesById):
        """Build an <entity> element for this interaction word and register
        it in entitiesById under the word's original id.

        Words with an empty charOffset are skipped entirely (no element is
        created or registered).
        """
        interactionWordElement = None
#        for entity in entitiesById.values():
#            if entity.attrib["charOffset"] == Range.tuplesToCharOffset(self.clueTypeCharOffsets):
#                interactionWordElement = entity
#                interactionWordElement.attrib["type"] = self.type
#                interactionWordElement.attrib["isName"] = "False"
#                break
        # Always true while the element-reuse lookup above stays commented out.
        if interactionWordElement == None:
            interactionWordElement = ET.Element("entity")
            interactionWordElement.attrib["origId"] = self.id
            interactionWordElement.attrib["type"] = self.type
            interactionWordElement.attrib["isName"] = "False"
            interactionWordElement.attrib["charOffset"] = Range.tuplesToCharOffset(self.clueTypeCharOffsets)
            if interactionWordElement.attrib["charOffset"] == "":
                return
#            if self.headToken != None:
#                interactionWordElement.attrib["headOffset"] = Range.tuplesToCharOffset(self.headToken.charOffset)
#            else:
#                interactionWordElement.attrib["headOffset"] = interactionWordElement.attrib["charOffset"]
            interactionWordElement.attrib["text"] = str(self.clueTypeTexts)
            interactionWordElement.attrib["id"] = sentenceId + ".e" + str(len(entitiesById))
            entitiesById[self.id] = interactionWordElement
Exemplo n.º 58
0
def mapTriggers(sourceLines, targetLines, options):
    """Map trigger ids in targetLines to trigger ids in sourceLines.

    Trigger lines (those starting with "T") are matched by type and
    character offset; an exact offset match is preferred over a mere
    overlap. Returns {target trigger id: source trigger id or None}.
    """
    sourceSplits = []
    triggerMap = {}
    firstTriggerLine = True
    for sourceLine in sourceLines:
        if sourceLine[0] != "T":
            continue
        sourceSplit = sourceLine.split()
        if firstTriggerLine:
            # Trigger numbers below the first listed one are assumed to
            # map to themselves.
            for i in range(1,int(sourceSplit[0][1:])):
                triggerMap["T"+str(i)] = "T"+str(i)
            firstTriggerLine = False
        sourceSplits.append(sourceSplit)

    matchTypes = {}  # target trigger id -> "overlap" or "exact"
    for targetLine in targetLines:
        if targetLine[0] != "T":
            continue
        splits = targetLine.split()
        triggerMap[splits[0]] = None
        targetOffset = (int(splits[2]), int(splits[3]))
        for i in range(len(sourceSplits)):
            # Only triggers of the same type can match.
            if splits[1] == sourceSplits[i][1]:
                sourceOffset = (int(sourceSplits[i][2]), int(sourceSplits[i][3]))
                if Range.overlap(sourceOffset, targetOffset):
                    matchType = "overlap"
                    if sourceOffset == targetOffset:
                        matchType = "exact"

                    # Take the first match, or upgrade an overlap to exact.
                    if triggerMap[splits[0]] == None or (matchTypes[splits[0]] == "overlap" and matchType == "exact"):
                        triggerMap[splits[0]] = sourceSplits[i][0]
                        matchTypes[splits[0]] = matchType

        if triggerMap[splits[0]] == None:
            if options.verbose:
                print >> sys.stderr, "  Trigger not found:", splits[0]
    return triggerMap