def _assignDomainHeader(protein, header):
    """Copy the domain fields encoded in *header* onto *protein*.

    Original note beside this parser read "header, protein, start, end" --
    presumably the field order returned by
    Helper.retrieveDomainHeaderInformation (TODO confirm); only indices
    1, 2 and 3 are consumed here.
    """
    info = Helper.retrieveDomainHeaderInformation(header)
    protein.domain = info[1]
    protein.start = int(info[2])
    protein.end = int(info[3])
    protein.header = header


def initDomainLevelProteins(domainfile):
    """Parse an ortholog-group table whose protein columns are domain headers.

    Everything up to and including the first line starting with '_' is
    ignored. After that:

    * a line starting with 'Group' opens a new group via
      OrthologyGroup.getBasicOrthologyGroup(line, False, orthologGroups);
    * a line starting with 'Bootstrap' is passed to the current group's
      addSeeds();
    * other lines starting with 'Score', 'Boots' or '_____' are skipped;
    * blank lines are skipped;
    * any other line is a protein row. A row that does not start with a
      space carries a species-A entry in its first two tokens (header,
      score); the last two tokens carry the species-B entry when the row
      starts with a space or getBasicProteins() returned more than one
      protein.

    Args:
        domainfile: path of the table to read.

    Returns:
        A tuple (proteinsA, proteinsB, orthologGroups). The first two are
        dicts mapping header -> protein (re-classed to DomainLevelProtein).
        orthologGroups is the dict handed to getBasicOrthologyGroup, which
        presumably registers every group in it -- verify against that
        method.
    """
    proteinsA = {}
    proteinsB = {}
    orthologGroups = {}
    groupsStarted = False
    ort = None
    lineStarts = ('Group', 'Score', 'Boots', '_____')

    # 'with' guarantees the handle is closed even when a row fails to parse.
    with open(domainfile, 'r') as handle:
        for line in handle:
            if not groupsStarted:
                # The first underscore rule marks the end of the preamble.
                if line.startswith('_'):
                    groupsStarted = True
                continue

            if line.startswith(lineStarts):
                if line.startswith('Group'):
                    ort = OrthologyGroup.getBasicOrthologyGroup(
                        line, False, orthologGroups)
                elif line.startswith('Bootstrap'):
                    ort.addSeeds(line)
                continue

            splittedLine = line.split()
            if not splittedLine:
                # Blank line: nothing to index into, so skip it instead of
                # failing on splittedLine[1] / splittedLine[-2].
                continue

            # Rows without a species-A column start with a space.
            hasA = not line.startswith(' ')
            temp = ort.getBasicProteins(splittedLine)
            for p in temp:
                p.__class__ = DomainLevelProtein

            if hasA:
                proteinA = temp[0]
                _assignDomainHeader(proteinA, splittedLine[0])
                proteinsA[proteinA.header] = proteinA
                # Scores look like '100.00%'; keep the numeric part only.
                score = float(splittedLine[1].split('%')[0])
                ort.inparalogsA[proteinA.header] = score

            if not hasA or len(temp) > 1:
                proteinB = temp[-1]
                _assignDomainHeader(proteinB, splittedLine[-2])
                proteinsB[proteinB.header] = proteinB
                score = float(splittedLine[-1].split('%')[0])
                ort.inparalogsB[proteinB.header] = score

    return proteinsA, proteinsB, orthologGroups
def _assignDomainHeader(protein, header):
    """Copy the domain fields encoded in *header* onto *protein*.

    Only indices 1, 2 and 3 of the sequence returned by
    Helper.retrieveDomainHeaderInformation are used (domain, start, end);
    start and end are converted to int.
    """
    info = Helper.retrieveDomainHeaderInformation(header)
    protein.domain = info[1]
    protein.start = int(info[2])
    protein.end = int(info[3])
    protein.header = header


def initDomainLevelProteins(domainfile):
    """Parse an ortholog-group table whose protein columns are domain headers.

    Everything up to and including the first line starting with '_' is
    ignored. After that:

    * a line starting with 'Group' opens a new group via
      OrthologyGroup.getBasicOrthologyGroup(line, False, orthologGroups);
    * a line starting with 'Bootstrap' is passed to the current group's
      addSeeds();
    * other lines starting with 'Score', 'Boots' or '_____' are skipped;
    * blank lines are skipped;
    * any other line is a protein row. A row that does not start with a
      space carries a species-A entry in its first two tokens (header,
      score); the last two tokens carry the species-B entry when the row
      starts with a space or getBasicProteins() returned more than one
      protein.

    Args:
        domainfile: path of the table to read.

    Returns:
        A tuple (proteinsA, proteinsB, orthologGroups). The first two are
        dicts mapping header -> protein (re-classed to DomainLevelProtein).
        orthologGroups is the dict handed to getBasicOrthologyGroup, which
        presumably registers every group in it -- verify against that
        method.
    """
    proteinsA = {}
    proteinsB = {}
    orthologGroups = {}
    groupsStarted = False
    ort = None
    lineStarts = ('Group', 'Score', 'Boots', '_____')

    # 'with' guarantees the handle is closed even when a row fails to parse.
    with open(domainfile, 'r') as handle:
        for line in handle:
            if not groupsStarted:
                # The first underscore rule marks the end of the preamble.
                if line.startswith('_'):
                    groupsStarted = True
                continue

            if line.startswith(lineStarts):
                if line.startswith('Group'):
                    ort = OrthologyGroup.getBasicOrthologyGroup(
                        line, False, orthologGroups)
                elif line.startswith('Bootstrap'):
                    ort.addSeeds(line)
                continue

            splittedLine = line.split()
            if not splittedLine:
                # Blank line: nothing to index into, so skip it instead of
                # failing on splittedLine[1] / splittedLine[-2].
                continue

            # Rows without a species-A column start with a space.
            hasA = not line.startswith(' ')
            temp = ort.getBasicProteins(splittedLine)
            for p in temp:
                p.__class__ = DomainLevelProtein

            if hasA:
                proteinA = temp[0]
                _assignDomainHeader(proteinA, splittedLine[0])
                proteinsA[proteinA.header] = proteinA
                # Scores look like '100.00%'; keep the numeric part only.
                score = float(splittedLine[1].split('%')[0])
                ort.inparalogsA[proteinA.header] = score

            if not hasA or len(temp) > 1:
                proteinB = temp[-1]
                _assignDomainHeader(proteinB, splittedLine[-2])
                proteinsB[proteinB.header] = proteinB
                score = float(splittedLine[-1].split('%')[0])
                ort.inparalogsB[proteinB.header] = score

    return proteinsA, proteinsB, orthologGroups
def initGeneLevelProteins(filename, tsvfileA, tsvfileB, useDomains):
    """Parse an ortholog-group table into gene-level proteins.

    The option "domainlengthcutoff" is read from section "Options" of
    "orthology.cfg" (path relative to the current working directory) on
    every call, even when useDomains is false; it is only consumed by
    Helper.getDomainsFromTsv.

    Table layout: everything up to and including the first line starting
    with '_' is ignored. After that:

    * a line starting with 'Group' opens a new group via
      OrthologyGroup.getBasicOrthologyGroup(line, True, orthologGroups);
    * a line starting with 'Bootstrap' is passed to the current group's
      addSeeds();
    * other lines starting with 'Score', 'Boots' or '_____' are skipped;
    * blank lines are skipped;
    * any other line is a protein row. A row that does not start with a
      space carries a species-A entry (score in the second token); the
      species-B entry (score in the last token) is present when the row
      starts with a space or getBasicProteins() returned more than one
      protein.

    Two summary lines (expected pair count and number of groups) are
    printed before returning.

    Args:
        filename: path of the ortholog table.
        tsvfileA: domain TSV for species A; only read when useDomains.
        tsvfileB: domain TSV for species B; only read when useDomains.
        useDomains: when true, attach domainsX[accession] to every protein
            as its `domains` attribute.

    Returns:
        (proteinsA, proteinsB, orthologGroups) when useDomains is false,
        otherwise (proteinsA, proteinsB, orthologGroups, shortA, shortB),
        where shortA/shortB are the second values returned by
        Helper.getDomainsFromTsv (meaning not visible here -- presumably
        the entries below the length cutoff; confirm against Helper).

    Raises:
        KeyError: when useDomains is true and a protein accession is
            missing from the corresponding domain mapping.
    """
    proteinsA = {}
    proteinsB = {}
    orthologGroups = {}
    groupsStarted = False

    rcp = ConfigParser.RawConfigParser()
    rcp.read("orthology.cfg")
    cutoff = rcp.getint("Options", "domainlengthcutoff")

    if useDomains:
        domainsA, shortA = Helper.getDomainsFromTsv(tsvfileA, cutoff)
        domainsB, shortB = Helper.getDomainsFromTsv(tsvfileB, cutoff)

    ort = None
    lineStarts = ("Group", "Score", "Boots", "_____")

    # 'with' guarantees the handle is closed even when a row fails to parse.
    with open(filename, "r") as handle:
        for line in handle:
            if not groupsStarted:
                # The first underscore rule marks the end of the preamble.
                if line.startswith("_"):
                    groupsStarted = True
                continue

            if line.startswith(lineStarts):
                if line.startswith("Group"):
                    ort = OrthologyGroup.getBasicOrthologyGroup(
                        line, True, orthologGroups)
                elif line.startswith("Bootstrap"):
                    ort.addSeeds(line)
                continue

            splittedLine = line.split()
            if not splittedLine:
                # Blank line: nothing to index into, so skip it instead of
                # failing on splittedLine[1] / splittedLine[-1].
                continue

            # Rows without a species-A column start with a space.
            hasA = not line.startswith(" ")
            temp = ort.getBasicProteins(splittedLine)

            if hasA:
                proteinA = temp[0]
                proteinA.__class__ = GeneLevelProtein
                proteinsA[proteinA.accession] = proteinA
                if useDomains:
                    proteinA.domains = domainsA[proteinA.accession]
                # Scores look like '100.00%'; keep the numeric part only.
                score = float(splittedLine[1].split("%")[0])
                ort.inparalogsA[proteinA.accession] = score

            if not hasA or len(temp) > 1:
                proteinB = temp[-1]
                proteinB.__class__ = GeneLevelProtein
                proteinsB[proteinB.accession] = proteinB
                if useDomains:
                    proteinB.domains = domainsB[proteinB.accession]
                score = float(splittedLine[-1].split("%")[0])
                ort.inparalogsB[proteinB.accession] = score

    # Every A/B inparalog combination within a group counts as one pair.
    pairsCount = sum(
        len(group.inparalogsA) * len(group.inparalogsB)
        for group in orthologGroups.values()
    )
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("%s should be the amount of pairs" % pairsCount)
    print("%s ortholog groups read from the file" % len(orthologGroups))

    if useDomains:
        return proteinsA, proteinsB, orthologGroups, shortA, shortB
    return proteinsA, proteinsB, orthologGroups