Пример #1
0
    def __init__(self, filename):
        """Read the relation table in *filename* and build the ID lookups.

        Each row is read for its ``MIMAT``, ``ORGMIR``, ``MI`` and ``ORGMI``
        columns. The ``mimat2*`` dictionaries hold a single value per MIMAT
        key (a later row with the same key replaces the earlier value); every
        other lookup accumulates the values seen for a key in a set.

        :param filename: path passed on to ``DataFrame.parseFromFile``
        """
        # single-valued lookups keyed by MIMAT
        self.mimat2mi = {}
        self.mimat2orgmi = {}
        self.mimat2orgmir = {}

        # set-valued lookups
        self.orgmir2mimat = defaultdict(set)
        self.mi2orgmi = defaultdict(set)
        self.mi2orgmir = defaultdict(set)
        self.orgmi2mi = defaultdict(set)
        self.orgmir2mi = defaultdict(set)

        relationTable = DataFrame.parseFromFile(filename,
                                                bConvertTextToNumber=False)

        for entry in relationTable:
            mimatID, orgmirID = entry['MIMAT'], entry['ORGMIR']
            miID, orgmiID = entry['MI'], entry['ORGMI']

            # MIMAT -> everything else
            self.mimat2mi[mimatID] = miID
            self.mimat2orgmir[mimatID] = orgmirID
            self.mimat2orgmi[mimatID] = orgmiID

            # ORGMIR -> MIMAT / MI
            self.orgmir2mimat[orgmirID].add(mimatID)
            self.orgmir2mi[orgmirID].add(miID)

            # MI <-> ORGMI, MI -> ORGMIR
            self.mi2orgmi[miID].add(orgmiID)
            self.mi2orgmir[miID].add(orgmirID)
            self.orgmi2mi[orgmiID].add(miID)
Пример #2
0
from collections import Counter, defaultdict

from porestat.utils.DataFrame import DataFrame
from utils.idutils import ltype2label, makeDBGeneID, mirtarbase_exp_type, mirtarbase_function_label, speciesName2TaxID, \
    dataDir
from database.Neo4JInterface import neo4jInterface
from utils.parallel import MapReduce

# Load the miRTarBase table; number conversion is switched off for the parse.
mirtarbaseEvidences = DataFrame.parseFromFile(
    dataDir + "/miRExplore/miRTarBase.csv",
    bConvertTextToNumber=False,
)

print(mirtarbaseEvidences.getHeader())

# Three independent tallies, all empty at this point.
experimentTypes, supportTypes, referencesWithComma = Counter(), Counter(), Counter()

db = neo4jInterface(simulate=False, printQueries=False)

# Relationship clean-up, issued in exactly this order. Each entry supplies the
# 2nd, 5th and 7th positional argument of deleteRelationship (presumably the
# labels of 'n', the labels of 'm' and the relationship type -- confirm
# against neo4jInterface).
for firstLabels, secondLabels, relTypes in (
    (['GENE'], ['MIRTARBASE'], ['GENE_MENTION']),
    (['MIRTARBASE'], ['MIRNA'], ['MIRNA_MENTION']),
    (['MIRTARBASE'], ['PUBMED'], ['MIRTARBASE_LITERATURE_SUPPORT']),
    (['MIRTARBASE_SUPPORT'], ['MIRTARBASE'], ['MIRTARBASE_FUNCTIONAL_SUPPORT']),
    (['MIRTARBASE_EXPERIMENT'], ['MIRTARBASE'], ['MIRTARBASE_EXPERIMENT_SUPPORT']),
    (['MIRTARBASE'], ['TAX'], ['ORGANISM_SUPPORT']),
):
    db.deleteRelationship('n', firstLabels, None, 'm', secondLabels, None,
                          relTypes, None, 'r')

# Node clean-up follows the relationship clean-up.
for nodeLabel in ("MIRTARBASE", "MIRTARBASE_SUPPORT", "MIRTARBASE_EXPERIMENT"):
    db.deleteNode([nodeLabel], None)

db.createUniquenessConstraint('MIRTARBASE', 'id')

# Disabled branch: the condition is constant, so close() is never called here.
if False:
    db.close()
Пример #3
0
                        help='alignment files')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=False,
                        help="output base")

    args = parser.parse_args()

    # No explicit output base: fall back to the path of the summary file.
    if args.output is None:
        args.output = args.summary.name

    # Parse the summary table; "None", "" and "NA" are handed over as values
    # to be replaced with None.
    indf = DataFrame.parseFromFile(args.summary.name,
                                   skipChar='#',
                                   replacements={
                                       "None": None,
                                       "": None,
                                       "NA": None
                                   })

    # All header names except "Status".
    # NOTE(review): if getHeader() returns the frame's own list rather than a
    # copy, this remove() also changes the frame's header -- confirm.
    allCols = indf.getHeader()
    allCols.remove("Status")

    # "Status" value of every row, in row order.
    allStatus = [row["Status"] for row in indf]

    # sample -> dict, created on first access
    sampleData = defaultdict(dict)

    for row in indf:
Пример #4
0
from collections import defaultdict

from porestat.utils.DataFrame import DataFrame
from database.MIRFamily import MIRFamilyDB
from neo4j.v1 import GraphDatabase, basic_auth

from database.Neo4JInterface import neo4jInterface
from database.ORGMIRs import ORGMIRDB
from synonymes.mirnaID import miRNA, miRNAPART
from utils.idutils import dataDir

# Parse the miRBase table with number conversion switched off.
mirbase = DataFrame.parseFromFile(
    dataDir + "/miRExplore/mirnas_mirbase.csv",
    bConvertTextToNumber=False,
)

# Family database built from the miFam.dat file.
filename = dataDir + "/miRExplore/miFam.dat"
familyDB = MIRFamilyDB(filename)

print(mirbase.getHeader())

db = neo4jInterface(simulate=False, printQueries=False)

# Three deletions that differ only in the relationship type, in this order.
for orgRelation in ('IS_ORG_MI', 'IS_ORG_MIR', 'IS_ORG_MIRNA'):
    db.deleteRelationship('n', None, None, 'm', None, None, [orgRelation],
                          None, 'r')

# Deletion restricted by the MIRNA / MIRNA_PRE label lists.
db.deleteRelationship(
    'n', ['MIRNA'], None,
    'm', ['MIRNA_PRE'], None,
    ['MIRNA_MATURE_OF'], None, 'r',
)

db.deleteRelationship('n', ['MIRNA'], None, 'm', ['MIRNA_FAMILY'], None,
Пример #5
0
            "hgnc2sym2ens2uniprot") as fin:

        for line in fin:
            line = line.strip().split("\t")

            sym = line[0]
            approvSym = line[3]

            sym2approvSym[sym] = approvSym

    # set_id -> (set_descr, set of approved symbols)
    availSets = {}

    setDF = DataFrame.parseFromFile(args.sets,
                                    skipChar='#',
                                    replacements={
                                        "None": None,
                                        "": None,
                                        "NA": None
                                    })

    # Union of the approved symbols of all sets.
    allSetGenes = set()
    for row in setDF:
        setDescr = row["set_descr"]

        # An empty cell is handed over for replacement with None (see the
        # replacements above), so a set without genes may arrive as None;
        # treat that as "no genes" instead of failing on None.split().
        rawGenes = row["genes"] or ""

        # Keep only symbols that have an approved symbol, mapped to it.
        setGenes = {
            sym2approvSym[sym]
            for sym in (part.strip() for part in rawGenes.split(";"))
            if sym in sym2approvSym
        }

        availSets[row['set_id']] = (setDescr, setGenes)

        # Grow the running union in place rather than copying it per row.
        allSetGenes.update(setGenes)

    print("Got", len(availSets), "sets with a total of", len(allSetGenes),
Пример #6
0
import gzip
import os

from collections import defaultdict
from porestat.utils.DataFrame import DataFrame

from utils.idutils import miRExploreDir

# Load the HGNC table with parseFromFile's default options.
hgncData = DataFrame.parseFromFile(miRExploreDir + "/hgnc.tsv")

# Collect every distinct entry of the UniProt column.
allUniprotIDs = set()
for row in hgncData:

    uniprotVals = row['UniProt ID(supplied by UniProt)']

    # rows without a value in that column are skipped
    if uniprotVals is None:
        continue

    # one cell may list several IDs separated by ", "
    uniprotVals = uniprotVals.strip()
    uniprotIDs = uniprotVals.split(', ')

    allUniprotIDs.update(uniprotIDs)

print(len(allUniprotIDs))

# From here on allUniprotIDs is a sorted list, no longer a set.
allUniprotIDs = sorted(allUniprotIDs)

uniprot2ipr = defaultdict(set)
neededUniprotIDs = miRExploreDir + "/interpro/relevant.uniprot.list"

# zgrep -f relevant.uniprot.list > relevant.uniprot.ipr.list
Пример #7
0
    # optional list of sample names; an empty list when the flag is absent
    parser.add_argument('-s2', '--samples', type=str, nargs='+', default=[])

    # boolean switch, stored as args.prefix_counts
    parser.add_argument('-pc', '--prefix-counts',
                        action='store_true',
                        dest="prefix_counts",
                        default=False,
                        help="run FC part")

    args = parser.parse_args()

    # Cell values handed to the parser for replacement with None. Each call
    # receives its own copy of the mapping.
    missingValues = {"None": None, "": None, "NA": None}

    indf1 = DataFrame.parseFromFile(args.de1.name,
                                    skipChar='#',
                                    replacements=dict(missingValues))
    indf2 = DataFrame.parseFromFile(args.de2.name,
                                    skipChar='#',
                                    replacements=dict(missingValues))

    allSamples = args.samples
    print("all samples", allSamples)
Пример #8
0
from Bio import Entrez
from porestat.utils.DataFrame import DataFrame

from utils.idutils import miRExploreDir


# Parse the miR2Disease dump; the list is passed as the second positional
# argument of parseFromFile (presumably the column names -- the rows below
# are indexed with 'title').
dbData = DataFrame.parseFromFile(
    miRExploreDir + "/miR2Disease/AllEntries.txt",
    ['mirna', 'disease', 'effect', 'measurement', 'year', 'title'],
    bConvertTextToNumber=False)

pmidTitleIdx = dbData.getColumnIndex('title')

# Distinct titles, each normalised to end with a full stop.
uniqueTitles = set()

for row in dbData:

    title = row['title']

    if title is None:
        continue

    title = title.strip()

    # A blank title has no last character to inspect (title[-1] would raise
    # IndexError) and carries no text, so it is skipped.
    if not title:
        continue

    if not title.endswith('.'):
        title += "."

    uniqueTitles.add(title)

# Kept as a list, as before; the order of the entries is arbitrary.
allTitles = list(uniqueTitles)

print(len(allTitles))

titlesSearch = []
Пример #9
0
    # required list of prefixes
    parser.add_argument("-p", "--prefixes", type=str, nargs='+', required=True)
    # optional list of sample names; an empty list when the flag is absent
    parser.add_argument('-s2', '--samples', type=str, nargs='+', default=[])

    # boolean switch, stored as args.prefix_counts
    parser.add_argument('-pc', '--prefix-counts',
                        action='store_true',
                        dest="prefix_counts",
                        default=False,
                        help="run FC part")

    args = parser.parse_args()

    # Keyword options for parsing the first table in args.de.
    parseOptions = dict(
        skipChar='#',
        replacements={"None": None, "": None, "NA": None},
    )
    curDF = DataFrame.parseFromFile(args.de[0].name, **parseOptions)

    for didx, deTable in enumerate(args.de):

        if didx == 0:
            continue

        indf2 = DataFrame.parseFromFile(deTable.name,
                                        skipChar='#',
                                        replacements={
                                            "None": None,
                                            "": None,
                                            "NA": None
Пример #10
0
    # p-value option, 0.05 unless given on the command line
    parser.add_argument('-p', '--pval', type=float, default=0.05)
    # one or more tool names; None when the flag is absent
    parser.add_argument('-t', '--tools', nargs='+')

    # mandatory output file, opened for writing by argparse
    parser.add_argument('-o', '--output', type=argparse.FileType("w"), required=True)

    #parser.add_argument('-g', '--gene', type=str, required=True, help="gene id column name")


    args = parser.parse_args()


    # Parse each file in args.counts; "None", "" and "NA" are handed over as
    # values to be replaced with None.
    for fidx, defile in enumerate(args.counts):
        indf = DataFrame.parseFromFile(defile.name, skipChar='#', replacements = {
            "None": None,
            "": None,
            "NA": None
        })

        inHeaders = indf.getHeader()

        #if not args.gene in inHeaders:
        #    print("Unknown gene id column", args.gene)
        #    print(inHeaders)
        #    exit(-1)

        # reset for every input file
        allconditions = []

        for conditions in args.conditions:

            for condition in conditions:
Пример #11
0
    #parser.add_argument('-c', '--cutoff', type=float, help='alignment files', default=0.05)
    # fold-change option, 1.0 unless given
    parser.add_argument('-minfc', '--min-foldchange',
                        type=float, required=False, default=1.0)
    # p-value option, 0.05 unless given
    parser.add_argument('-minpval', '--min-pvalue',
                        type=float, required=False, default=0.05)

    args = parser.parse_args()

    for fidx, defile in enumerate(args.de):
        indf = DataFrame.parseFromFile(defile.name)

        # per-file bookkeeping, empty at the start of each file
        availMethods = set()
        headername2idx = {}

        indfHeader = indf.getHeader()

        # Gene column: "gene_symbol" if the header has it, else "Geneid" if
        # the header has that, else "id".
        genesymname = next(
            (candidate for candidate in ("gene_symbol", "Geneid")
             if candidate in indfHeader),
            "id",
        )
Пример #12
0
from collections import Counter
from collections import defaultdict

from porestat.utils.DataFrame import DataFrame, ExportTYPE

# Load the interaction table from an absolute, machine-specific path using
# parseFromFile's default options.
interactions = DataFrame.parseFromFile(
    "/home/users/joppich/ownCloud/data/chemokines_sfb/chemokine_interactions.tsv"
)

#interactions.export("/home/users/joppich/ownCloud/data/chemokines_sfb/chemokine_interactions.html", ExportTYPE.HTML)

# Starts out empty.
uniqueEdges = set()

chemList = [
    'CXCR2',
    'CCL9',
    'CXCL5',
    'CXCL1',
    'CXCL13',
    'CXCL7',
    'CCL2',
    'CXCL9',
    'CCL3',
    'CXCL10',
    'CCL22',
    'CCR5',
    'CCR7',
    'CCL7',
    'CCL4',
    'CXCR4',
    'CX3CL1',
Пример #13
0
from collections import Counter
from neo4j.v1 import GraphDatabase, basic_auth
from porestat.utils.DataFrame import DataFrame
from utils.idutils import ltype2label, makeDBGeneID, dataDir
from database.Neo4JInterface import neo4jInterface

# Parse the HGNC/Ensembl/Entrez table with number conversion switched off.
hgncGenes = DataFrame.parseFromFile(dataDir +
                                    "/miRExplore/hgnc_ensembl_entrez.tsv",
                                    bConvertTextToNumber=False)

# Tally, empty at this point.
allStatus = Counter()

db = neo4jInterface(simulate=False)
db.createUniquenessConstraint('GENE', 'id')

# Remove HAS_GENE relationships first, then the GENE nodes.
db.deleteRelationship('n', None, None, 'm', None, None, ['HAS_GENE'], None,
                      'r')
db.deleteNode(["GENE"], None)

# Per row: pull the individual HGNC columns into local names.
for gene in hgncGenes:

    hgncID = gene['HGNC ID']
    hgncSym = gene['Approved Symbol']

    hgncName = gene['Approved Name']
    hgncEnsembl = gene['Ensembl ID(supplied by Ensembl)']
    hgncEntrez = gene['Entrez Gene ID(supplied by NCBI)']

    hgncStatus = gene['Status']
    hgncLocusType = gene['Locus Type']
Пример #14
0
import json
from collections import Counter
from collections import defaultdict

from porestat.utils.DataFrame import DataFrame, ExportTYPE

# Input location. The second inputFile assignment replaces the first, so the
# .jgf path is the one in effect and the '.sif' branch below is not taken
# with this value.
base = "/mnt/c/ownCloud/data/bcn/"
inputFile = base + "all/CV-IPN-Endothelial cell activation1.0.sif"
inputFile = base + "manual/CV-IPN-Endothelial cell activation1.0.jgf"

# Containers filled while reading the input.
allNodesLabels = {}
allEdges = []
uniqueNodes = set()

if inputFile.endswith('.sif'):
    # The list is the second positional argument of parseFromFile (presumably
    # the column names -- rows are indexed with 'source' / 'target' below).
    interactions = DataFrame.parseFromFile(inputFile,
                                           ['source', 'interaction', 'target'])

    # First pass: record both endpoints of every complete row.
    for row in interactions:

        src = row['source']
        dst = row['target']

        # rows lacking either endpoint are ignored
        if src is None or dst is None:
            continue

        uniqueNodes.add(src)
        uniqueNodes.add(dst)

    # Second pass over the same rows.
    for row in interactions:
        src = row['source']
        dst = row['target']
Пример #15
0
from collections import defaultdict

import editdistance
from nertoolkit.geneontology.GeneOntology import GeneOntology
from porestat.utils.DataFrame import DataFrame

from utils.idutils import miRExploreDir

# Parse the miRNA/disease table with number conversion switched off.
dbData = DataFrame.parseFromFile(
    miRExploreDir + "/miR2Disease/mirna_disease.tsv",
    bConvertTextToNumber=False)

# Upper-cased disease names of all rows that have one.
allDiseases = set()
for row in dbData:

    disease = row['disease']

    # Skip missing entries: the literal text 'None' as before, and also a
    # real None, on which .upper() would raise AttributeError.
    if disease is None or disease == 'None':
        continue

    allDiseases.add(disease.upper())

print(len(allDiseases))

diseaseObo = GeneOntology(miRExploreDir + "/doid.obo")

disease2obo = defaultdict(set)

"""

find perfect matches

"""
Пример #16
0
                        nargs='+',
                        type=str,
                        required=False,
                        help="output base")
    # boolean switch, False unless given
    parser.add_argument('-p', '--pathname', action="store_true", default=False)

    args = parser.parse_args()

    # No explicit output bases: reuse the name of each counts file.
    if args.output is None:
        args.output = [counts.name for counts in args.counts]

    # Parse each file in args.counts; "None", "" and "NA" are handed over as
    # values to be replaced with None.
    for fidx, defile in enumerate(args.counts):
        indf = DataFrame.parseFromFile(defile.name,
                                       skipChar='#',
                                       replacements={
                                           "None": None,
                                           "": None,
                                           "NA": None
                                       })

        inHeaders = indf.getHeader()

        #if not args.gene in inHeaders:
        #    print("Unknown gene id column", args.gene)
        #    print(inHeaders)
        #    exit(-1)

        for conditions in args.conditions:

            for condition in conditions:
Пример #17
0
from urllib import request

from porestat.utils.DataFrame import DataFrame

allgenomes = DataFrame.parseFromFile("../../../ena_bacteria_list.csv")
print(allgenomes.getHeader())

# First pass: print the 'proteins' value of every row.
for row in allgenomes:

    protInfo = row['proteins']
    print(protInfo)

# Target directory for the downloads; it already ends with a slash, so file
# names are appended directly (no doubled "/" in the resulting path).
downloadLocation = "../../../genomes/"

# Second pass: fetch one file per row that has a usable 'proteins' value.
for row in allgenomes:

    protInfo = row['proteins']

    # Skip rows whose value is missing, blank or one of the placeholder texts.
    if protInfo is None or protInfo in ('n/a', 'None') or not protInfo.strip():
        continue

    downloadFile = row['seqID'] + ".gb"
    print(downloadFile)

    # Retrieve the URL built from the row's seqID and store it under
    # downloadLocation.
    request.urlretrieve(
        "http://www.ebi.ac.uk/ena/data/view/" + row['seqID'] +
        "&display=txt&expanded=true", downloadLocation + downloadFile)