Пример #1
0
def sgd_connection(gene, p_dir, l_dir):
    """Download phenotype rows, phenotype summary and PubMed records for a gene.

    Three YeastMine ``Gene`` queries are run, each constrained to
    S. cerevisiae and to a LOOKUP of ``gene``:

    1. phenotype columns, written as a tab-separated table (with a header
       line) to ``<p_dir>/<gene>.txt``;
    2. ``phenotypes.genes.phenotypeSummary``, whose non-empty values are
       concatenated into one string;
    3. PubMed ids, each passed to ``pubmed_connection``; whatever the
       returned handle's ``read()`` yields is appended to
       ``<l_dir>/<gene>.txt``.

    :param gene: Gene identifier used for the LOOKUP and for the file names.
    :param p_dir: Directory receiving the phenotype table.
    :param l_dir: Directory receiving the PubMed output.
    :returns: ``[gene, summary]``.
    """
    service = Service(
        'https://yeastmine.yeastgenome.org:443/yeastmine/service')

    # --- Phenotype table -------------------------------------------------
    a = service.new_query('Gene')
    view_list = [
        'primaryIdentifier', 'symbol', 'secondaryIdentifier', 'sgdAlias',
        'qualifier', 'phenotypes.experimentType', 'phenotypes.mutantType',
        'phenotypes.observable', 'phenotypes.qualifier', 'phenotypes.allele',
        'phenotypes.alleleComment', 'phenotypes.strainBackground',
        'phenotypes.chemical', 'phenotypes.condition', 'phenotypes.details',
        'phenotypes.reporter', 'phenotypes.publications.pubMedId',
        'phenotypes.publications.citation'
    ]
    a.add_view(*view_list)
    a.add_constraint('organism.shortName', '=', 'S. cerevisiae', code='B')
    a.add_constraint('Gene', 'LOOKUP', gene, code='A')
    header = (
        'Gene Primary DBID\tGene Standard Name\tGene Systematic Name\t'
        'Gene Sgd Alias\tGene Qualifier\tPhenotypes Experiment Type\t'
        'Phenotypes Mutant Type\tPhenotypes Observable\tPhenotypes Qualifier\t'
        'Phenotypes Allele\tPhenotypes Allele Comment\tPhenotypes Strain Background\t'
        'Phenotypes Chemical\tPhenotypes Condition\tPhenotypes Details\t'
        'Phenotypes Reporter\tPublications PubMed ID\tPublications Citation\n'
    )
    p_result_file = os.path.join(p_dir, '{0}.txt'.format(gene))
    with open(p_result_file, 'w', encoding='utf-8') as f1:
        # Collect the lines and join once instead of growing one string with
        # ``+=``. ``strip()`` is kept so leading/trailing whitespace of each
        # row is removed exactly as before.
        lines = [header]
        for row in a.rows():
            cells = '\t'.join(str(row[k]) for k in view_list)
            lines.append(cells.strip() + '\n')
        f1.write(''.join(lines))

    # --- Phenotype summary -----------------------------------------------
    b = service.new_query('Gene')
    b.add_view('phenotypes.genes.phenotypeSummary')
    b.add_constraint('organism.shortName', '=', 'S. cerevisiae', code='B')
    b.add_constraint('Gene', 'LOOKUP', gene, code='A')
    summaries = (row['phenotypes.genes.phenotypeSummary'] for row in b.rows())
    summary = ''.join(text for text in summaries if text)
    result_list = [gene, summary]

    # --- PubMed records ----------------------------------------------------
    c = service.new_query('Gene')
    c.add_view('publicationAnnotations.publication.pubMedId')
    c.add_constraint('organism.shortName', '=', 'S. cerevisiae', code='B')
    c.add_constraint('Gene', 'LOOKUP', gene, code='A')
    l_result_file = os.path.join(l_dir, '{0}.txt'.format(gene))
    with open(l_result_file, 'w', encoding='utf-8') as f2:
        for row in c.rows():
            pubmed_id = row['publicationAnnotations.publication.pubMedId']
            if not pubmed_id:
                continue
            # pubmed_connection may return a falsy value; such ids are skipped.
            handle = pubmed_connection(pubmed_id, gene)
            if handle:
                f2.write(handle.read())
    return result_list
Пример #2
0
def get_gene_id(gene_name):
    '''Retrieve systematic yeast gene name from the common name.

    Runs a YeastMine ``Gene`` query restricted to S. cerevisiae with a LOOKUP
    on ``gene_name`` and returns the ``secondaryIdentifier`` of the last row
    the query yields.

    :param gene_name: Common name for yeast gene (e.g. ADE2).
    :type gene_name: str
    :returns: Systematic name for yeast gene (e.g. YOR128C).
    :rtype: str
    :raises ValueError: If the query yields no rows for ``gene_name``.

    '''
    from intermine.webservice import Service

    service = Service('http://yeastmine.yeastgenome.org/yeastmine/service')
    query = service.new_query('Gene')

    # The view specifies the output columns
    query.add_view('primaryIdentifier', 'secondaryIdentifier', 'symbol',
                   'name', 'sgdAlias', 'crossReferences.identifier',
                   'crossReferences.source.name')

    query.add_constraint('organism.shortName', '=', 'S. cerevisiae', code='B')
    query.add_constraint('Gene', 'LOOKUP', gene_name, code='A')

    # Track whether any row was seen: previously an empty result left ``gid``
    # unbound and the function died with UnboundLocalError.
    found = False
    gid = None
    for row in query.rows():
        gid = row['secondaryIdentifier']
        found = True
    if not found:
        raise ValueError(
            'No S. cerevisiae gene found for {0!r}'.format(gene_name))
    return gid
Пример #3
0
def get_gene_id(gene_name):
    """Retrieve systematic yeast gene name from the common name.

    Runs a YeastMine ``Gene`` query restricted to S. cerevisiae with a LOOKUP
    on ``gene_name`` and returns the ``secondaryIdentifier`` of the last row
    the query yields.

    :param gene_name: Common name for yeast gene (e.g. ADE2).
    :type gene_name: str
    :returns: Systematic name for yeast gene (e.g. YOR128C).
    :rtype: str
    :raises ValueError: If the query yields no rows for ``gene_name``.

    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")
    query = service.new_query("Gene")

    # The view specifies the output columns
    query.add_view("primaryIdentifier", "secondaryIdentifier", "symbol",
                   "name", "sgdAlias", "crossReferences.identifier",
                   "crossReferences.source.name")

    query.add_constraint("organism.shortName", "=", "S. cerevisiae", code="B")
    query.add_constraint("Gene", "LOOKUP", gene_name, code="A")

    # Track whether any row was seen: previously an empty result left ``gid``
    # unbound and the function died with UnboundLocalError.
    found = False
    gid = None
    for row in query.rows():
        gid = row["secondaryIdentifier"]
        found = True
    if not found:
        raise ValueError(
            "No S. cerevisiae gene found for {0!r}".format(gene_name))
    return gid
Пример #4
0
def get_all_gene_annotations():
    """Fetch gene annotation columns from YeastMine into a DataFrame.

    An unconstrained ``Gene`` query is run and only the first row seen for
    each ``secondaryIdentifier`` is kept. Several columns are renamed to
    shorter labels, and a ``Gene_ORF`` column is added by applying
    ``gene_orfer`` to every row.

    :returns: ``pandas.DataFrame`` with one row per distinct
        ``secondaryIdentifier``.
    """
    columns = [
        "briefDescription", "description", "functionSummary",
        "chromosome.primaryIdentifier", "secondaryIdentifier", "symbol",
        "phenotypeSummary", "locations.strand", "locations.end",
        "locations.start"
    ]
    service = Service(
        "https://yeastmine.yeastgenome.org:443/yeastmine/service")
    query = service.new_query("Gene")
    query.add_view(columns)

    values_by_column = {name: [] for name in columns}
    seen_orfs = set()
    for row in query.rows():
        orf = row['secondaryIdentifier']
        # YeastMine repeats rows in its output; keep the first per ORF only.
        if orf in seen_orfs:
            continue
        for name in columns:
            values_by_column[name].append(row[name])
        seen_orfs.add(orf)

    short_names = {
        'chromosome.primaryIdentifier': 'chromosome',
        'secondaryIdentifier': 'ORF',
        'symbol': 'Gene',
        'locations.start': 'start',
        'locations.end': 'end',
        'locations.strand': 'orf_strand'
    }
    table = pd.DataFrame(values_by_column).rename(columns=short_names)
    table['Gene_ORF'] = table.apply(lambda record: gene_orfer(record), axis=1)
    return table
Пример #5
0
def main():
    """Connects to yeastmine and creates a dictionary of annotation data.

    Every ORF ``SequenceFeature`` of Saccharomyces cerevisiae is turned into
    a dict keyed by the last segment of each view path. The dict is stored
    under its ``secondaryIdentifier`` both in the returned mapping and in a
    shelve database opened at
    ``<settings.PROJECT_ROOT>/apps/annotations/SGD/yeastmine``.

    :returns: dict mapping ``secondaryIdentifier`` to the per-feature dict.
    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine")

    query = service.new_query()

    query.add_view(
        "SequenceFeature.primaryIdentifier", "SequenceFeature.featureType",
        "SequenceFeature.secondaryIdentifier", "SequenceFeature.description",
        "SequenceFeature.sgdAlias", "SequenceFeature.name", "SequenceFeature.symbol",
        "SequenceFeature.chromosome.name", "SequenceFeature.chromosome.featAttribute",
        "SequenceFeature.locations.start", "SequenceFeature.locations.end", "SequenceFeature.locations.strand"
        )
    query.add_constraint("SequenceFeature.organism.name", "=", "Saccharomyces cerevisiae", "A")
    query.add_constraint("SequenceFeature.featureType", "=", "ORF", "B")
    query.set_logic("(A and B)")

    annotation = {}
    db = shelve.open(os.path.join(settings.PROJECT_ROOT, 'apps', 'annotations', 'SGD', 'yeastmine'), 'c')
    # try/finally so the shelf is closed even if the query or a row fails.
    try:
        for row in query.rows():
            data = {}
            # enumerate replaces the Python-2-only xrange index loop.
            for x, view in enumerate(row.views):
                attribute = view.split('.')[-1]
                value = row.data[x]['value']
                # NOTE(review): "SequenceFeature.name" and
                # "SequenceFeature.chromosome.name" both reduce to the key
                # 'name', so a non-empty chromosome name overwrites the
                # feature name. Kept as-is; confirm that this is intended.
                if attribute == 'name' and not value:
                    continue
                data[attribute] = value
            if 'name' not in data:
                data['name'] = None
            annotation[data['secondaryIdentifier']] = data
            db[str(data['secondaryIdentifier'])] = data
    finally:
        db.close()
    return annotation
Пример #6
0
def intermine_query(ids, organism, *args):
    """Build a case-sensitive ``Gene`` query for ``ids`` in ``organism``.

    The service URL is taken from ``service_urls[organism]``. The query is
    returned without being executed here.

    :param ids: Value handed to the ``Gene LOOKUP`` constraint (code ``A``).
    :param organism: Organism name; used both as the key into
        ``service_urls`` and as the ``organism.name`` constraint (code ``B``).
    :param args: Output columns, forwarded to ``query.select``.
    :returns: The configured query object.
    """
    mine = Service(service_urls[organism])
    gene_query = mine.new_query("Gene", case_sensitive=True)
    gene_query.add_constraint("Gene", "LOOKUP", ids, code="A")
    gene_query.add_constraint("organism.name", "=", organism, code="B")
    gene_query.select(*args)
    return gene_query
Пример #7
0
def getInteractions():
    """Collect manually curated interactions of ribosomal protein genes.

    Queries YeastMine for genes whose name is one of the two ribosomal
    protein names below, annotated with the GO term "cytoplasmic
    translation", and groups their interaction rows by gene symbol.

    :returns: dict mapping gene symbol to a list of dicts with the keys
        ``expt`` (experiment type), ``gene2`` (interacting gene symbol) and
        ``desc`` (interacting gene brief description).
    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Gene")

    # Type constraints should come early - before all mentions of the paths they constrain
    query.add_constraint("goAnnotation.ontologyTerm", "GOTerm")

    # The view specifies the output columns
    query.add_view(
        "symbol", "interactions.details.experimentType",
        "interactions.gene2.symbol", "interactions.gene2.briefDescription"
    )

    query.add_constraint("goAnnotation.qualifier", "IS NULL", code = "C")
    query.add_constraint("goAnnotation.qualifier", "!=", "NOT", code = "B")
    query.add_constraint("goAnnotation.ontologyTerm.name", "=", "cytoplasmic translation", code = "A")
    query.add_constraint("name", "ONE OF", ["Ribosomal Protein of the Large subunit", "Ribosomal Protein of the Small subunit"], code = "D")
    query.add_constraint("interactions.details.annotationType", "=", "manually curated", code = "E")

    # Constraint logic: the qualifier is either absent or not "NOT".
    query.set_logic("A and (B or C) and E and D")

    interactions = {}
    for row in query.rows():
        # setdefault replaces the `in .keys()` test and the duplicated dict
        # literal of the two former branches.
        interactions.setdefault(row["symbol"], []).append({
            "expt": row["interactions.details.experimentType"],
            "gene2": row["interactions.gene2.symbol"],
            "desc": row["interactions.gene2.briefDescription"],
        })
    return interactions
Пример #8
0
def wmquery():
    """Print gene, intergenic-region and transcript columns from WormMine.

    Builds an unconstrained ``Gene`` query against the WormMine service and
    prints the requested view columns for every row returned. Nothing is
    returned.
    """
    service = Service("http://intermine.wormbase.org/tools/wormmine/service")
    query = service.new_query("Gene")
    # Output columns: gene basics, downstream/upstream intergenic regions,
    # homologue data set names and transcripts.
    query.add_view(
        "biotype", "length", "symbol", "primaryIdentifier",
        "downstreamIntergenicRegion.primaryIdentifier",
        "downstreamIntergenicRegion.organism.name",
        "downstreamIntergenicRegion.locations.feature.primaryIdentifier",
        "downstreamIntergenicRegion.locations.start",
        "downstreamIntergenicRegion.locations.end",
        "downstreamIntergenicRegion.locations.strand",
        "homologues.dataSets.name",
        "upstreamIntergenicRegion.primaryIdentifier",
        "upstreamIntergenicRegion.organism.name",
        "upstreamIntergenicRegion.locations.feature.primaryIdentifier",
        "upstreamIntergenicRegion.locations.start",
        "upstreamIntergenicRegion.locations.end",
        "upstreamIntergenicRegion.locations.strand",
        "transcripts.primaryIdentifier", "transcripts.symbol")

    # NOTE(review): with the Python 2 print statement this prints one tuple
    # per row; with the Python 3 print function it prints the values
    # separated by spaces. Confirm which interpreter is intended.
    for row in query.rows():
        print (row["biotype"], row["length"], row["symbol"], row["primaryIdentifier"], \
            row["downstreamIntergenicRegion.primaryIdentifier"], \
            row["downstreamIntergenicRegion.organism.name"], \
            row["downstreamIntergenicRegion.locations.feature.primaryIdentifier"], \
            row["downstreamIntergenicRegion.locations.start"], \
            row["downstreamIntergenicRegion.locations.end"], \
            row["downstreamIntergenicRegion.locations.strand"], row["homologues.dataSets.name"], \
            row["upstreamIntergenicRegion.primaryIdentifier"], \
            row["upstreamIntergenicRegion.organism.name"], \
            row["upstreamIntergenicRegion.locations.feature.primaryIdentifier"], \
            row["upstreamIntergenicRegion.locations.start"], \
            row["upstreamIntergenicRegion.locations.end"], \
            row["upstreamIntergenicRegion.locations.strand"], row["transcripts.primaryIdentifier"], \
            row["transcripts.symbol"])
Пример #9
0
def get_gene_id(gene_name):
    '''Retrieve systematic yeast gene name from the common name.

    Runs a YeastMine ``Gene`` query restricted to S. cerevisiae with a LOOKUP
    on ``gene_name`` and returns the ``secondaryIdentifier`` of the last row
    the query yields.

    :param gene_name: Common name for yeast gene (e.g. ADE2).
    :type gene_name: str
    :returns: Systematic name for yeast gene (e.g. YOR128C).
    :rtype: str
    :raises ValueError: If the query yields no rows for ``gene_name``.

    '''
    from intermine.webservice import Service

    service = Service('http://yeastmine.yeastgenome.org/yeastmine/service')
    query = service.new_query('Gene')

    # The view specifies the output columns
    query.add_view('primaryIdentifier', 'secondaryIdentifier', 'symbol',
                   'name', 'sgdAlias', 'crossReferences.identifier',
                   'crossReferences.source.name')

    query.add_constraint('organism.shortName', '=', 'S. cerevisiae', code='B')
    query.add_constraint('Gene', 'LOOKUP', gene_name, code='A')

    # Track whether any row was seen: previously an empty result left ``gid``
    # unbound and the function died with UnboundLocalError.
    found = False
    gid = None
    for row in query.rows():
        gid = row['secondaryIdentifier']
        found = True
    if not found:
        raise ValueError(
            'No S. cerevisiae gene found for {0!r}'.format(gene_name))
    return gid
Пример #10
0
def index_genes(organism, mod):
    """Fetch (or restore) all genes of one organism and bulk-index them.

    Gene data comes from a pickle backup named
    ``<organism>mine_genes_<MM_DD_YYYY>.bkp`` in the current directory when
    one exists for today; otherwise it is fetched from the InterMine service
    at ``mod["mine_service_url"]`` and the backup is written. The genes are
    then sent to Elasticsearch through ``es.bulk`` in batches.

    :param organism: Organism label; used in file names, document ids and
        progress messages.
    :param mod: Mapping with the keys ``mine_service_url``, ``gene_fields``
        (itself a mapping with ``id``, ``gene_name``, ``gene_symbol``,
        ``gene_synonym``, ``go_id`` and ``go_name``), ``mine_organism_name``,
        ``url_prefix`` and ``url_suffix``.
    """
    backup_filename = organism + "mine_genes_" + time.strftime("%m_%d_%Y") + ".bkp"
    if os.path.isfile(backup_filename):
        # Single-argument print(...) behaves the same as the Python 2 print
        # statement and also parses on Python 3.
        print("Restoring fetched data from today from " + organism + "mine")

        # SECURITY: pickle.load can execute arbitrary code from the file.
        # Only acceptable while the backup is produced by this function and
        # the working directory is trusted.
        with open(backup_filename, 'rb') as backup:
            genes = pickle.load(backup)
    else:
        print("Fetching data from " + organism + "mine")
        service = Service(mod["mine_service_url"])
        fields = mod["gene_fields"]

        query = service.new_query("Gene")
        # list(...) so a real list is passed on Python 3 as well, where
        # dict.values() is a view object.
        query.add_view(list(fields.values()))

        query.add_constraint("organism.name", "=", mod["mine_organism_name"], code="B")

        genes = {}

        for row in query.rows():
            gene_id = row[fields["id"]]

            if gene_id in genes:
                # Further rows of a known gene only contribute GO terms.
                genes[gene_id]["go_ids"].append(row[fields["go_id"]])
                genes[gene_id]["go_names"].append(row[fields["go_name"]])
            else:
                genes[gene_id] = {
                    "name": row[fields["gene_name"]],
                    "symbol": row[fields["gene_symbol"]],
                    "synonym": row[fields["gene_synonym"]],
                    "go_ids": [row[fields["go_id"]]],
                    "go_names": [row[fields["go_name"]]],
                    "href": mod["url_prefix"] + row["primaryIdentifier"] + mod["url_suffix"],
                    "organism": organism,
                    "category": "gene"
                }

        with open(backup_filename, 'wb') as backup:
            pickle.dump(genes, backup)

    print("Indexing " + str(len(genes)) + " " + organism + " genes")

    bulk_data = []
    for gene, document in genes.items():
        # Each gene contributes two entries: the action line and the document.
        bulk_data.append({
            'index': {
                '_index': INDEX_NAME,
                '_type': DOC_TYPE,
                '_id': organism + "_" + gene
            }
        })
        bulk_data.append(document)

        # 500 entries == 250 genes per request.
        if len(bulk_data) % 500 == 0:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
Пример #11
0
 def search_SGD(self, gene_code=None):
     """Look up the chromosomal location of a gene symbol in YeastMine.

     Only the first row returned is used. It is printed (with the full
     chromosome identifier) and then returned with the first three
     characters of the chromosome identifier removed.

     :param gene_code: Gene symbol used in the ``symbol =`` constraint.
     :returns: ``[secondaryIdentifier, chromosome, start, end, strand]``
         where ``strand`` is ``"+"`` or ``"-"``; ``None`` when the query
         yields no rows.
     """
     service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")
     query = service.new_query("Gene")
     query.add_view(
         "chromosome.primaryIdentifier",
         "chromosomeLocation.end",
         "chromosomeLocation.start",
         "chromosomeLocation.strand",
         "secondaryIdentifier",
     )
     query.add_constraint("symbol", "=", gene_code, code="A")
     for row in query.rows():
         strand = row["chromosomeLocation.strand"]
         # The strand used to be tested for truthiness only, so a reverse
         # strand value such as "-1" / -1 was reported as "+". A missing or
         # empty strand still maps to "-" as before.
         # NOTE(review): assumes the reverse strand is encoded with a
         # leading minus sign - confirm against the mine's data model.
         if not strand or str(strand).strip().startswith("-"):
             strand_symbol = "-"
         else:
             strand_symbol = "+"

         orf = row["secondaryIdentifier"]
         chromosome = row["chromosome.primaryIdentifier"]
         start = row["chromosomeLocation.start"]
         end = row["chromosomeLocation.end"]

         print([orf, chromosome, start, end, strand_symbol])
         # Drop the first three characters of the chromosome identifier.
         return [orf, chromosome[3:], start, end, strand_symbol]
Пример #12
0
def fetch_from_sgd() -> dict:
    """Query SGD's intermine service and return a dict of S. cerevisiae ORFs.

    The result maps each ``primaryIdentifier`` to a dict with the keys
    ``sgd_id``, ``feature_qualifier``, ``feature_type``, ``orf``, ``name``,
    ``aliases``, ``chromosome``, ``chromosomal_location``,
    ``start_coordinate``, ``stop_coordinate`` and ``description``.

    ``chromosome`` and ``chromosomal_location`` are derived from the ORF
    name: ORFs starting with ``Q`` get chromosome 0; otherwise the second
    character gives the chromosome (``A`` -> 1, ``B`` -> 2, ...) and the
    first run of digits gives the location, negated when the third
    character is ``L``.

    :rtype: dict
    """
    first_number = re.compile(r'(\d+)')

    service = Service("https://yeastmine.yeastgenome.org/yeastmine/service")
    query = service.new_query("Gene")
    query.add_view("primaryIdentifier", "featureType", "qualifier",
                   "secondaryIdentifier", "symbol", "chromosomeLocation.start",
                   "chromosomeLocation.end", "description", "synonyms.value")
    query.add_constraint("organism.shortName", "=", "S. cerevisiae", code="A")
    query.add_constraint("featureType", "=", "ORF", code="C")

    genes = {}

    logger.debug("Executing query on yeastmine")
    for row in query.rows():
        sgd_id = row["primaryIdentifier"]
        orf = row["secondaryIdentifier"]

        # First run of digits in the ORF name, or 0 when there is none.
        number_match = first_number.search(orf)
        location = int(number_match.group(1)) if number_match else 0

        if orf.startswith('Q'):
            chromosome = 0
        else:
            chromosome = ord(orf[1]) - 64
            if orf[2] == 'L':
                location = -location

        if sgd_id not in genes:
            logger.debug(f"Parsing new ORF: {orf}")
            genes[sgd_id] = dict(
                sgd_id=row["primaryIdentifier"],
                feature_qualifier=row["qualifier"],
                feature_type=row['featureType'],
                orf=orf,
                name=row["symbol"],
                aliases=[],
                chromosome=chromosome,
                chromosomal_location=location,
                start_coordinate=str(row["chromosomeLocation.start"]),
                stop_coordinate=str(row["chromosomeLocation.end"]),
                description=row["description"],
            )

        # Each row carries one synonym; skip it when it merely repeats the
        # ORF name or the symbol.
        synonym = row["synonyms.value"]
        if synonym not in (orf, row["symbol"]):
            genes[sgd_id]['aliases'].append(synonym)

    return genes
Пример #13
0
def get_gene_id(gene_name):
    """Retrieve systematic yeast gene name from the common name.

    Runs a YeastMine ``Gene`` query restricted to S. cerevisiae with a LOOKUP
    on ``gene_name`` and returns the ``secondaryIdentifier`` of the last row
    the query yields.

    :param gene_name: Common name for yeast gene (e.g. ADE2).
    :type gene_name: str
    :returns: Systematic name for yeast gene (e.g. YOR128C).
    :rtype: str
    :raises ValueError: If the query yields no rows for ``gene_name``.

    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")
    query = service.new_query("Gene")

    # The view specifies the output columns
    query.add_view("primaryIdentifier", "secondaryIdentifier", "symbol",
                   "name", "sgdAlias", "crossReferences.identifier",
                   "crossReferences.source.name")

    query.add_constraint("organism.shortName", "=", "S. cerevisiae", code="B")
    query.add_constraint("Gene", "LOOKUP", gene_name, code="A")

    # Track whether any row was seen: previously an empty result left ``gid``
    # unbound and the function died with UnboundLocalError.
    found = False
    gid = None
    for row in query.rows():
        gid = row["secondaryIdentifier"]
        found = True
    if not found:
        raise ValueError(
            "No S. cerevisiae gene found for {0!r}".format(gene_name))
    return gid
Пример #14
0
def query_fishmine(intermine_url: str, protein_id: str, query: str="Gene") -> IntermineResult:
    """Find ZFIN gene identifiers cross-referenced to ``protein_id``.

    Queries the InterMine service at ``intermine_url`` on the class named by
    ``query`` for records whose ``primaryIdentifier`` contains ``ZDB*`` and
    whose ``crossReferences.identifier`` equals ``protein_id``. Each matching
    identifier is prefixed with ``ZFIN:`` and the list is handed to
    ``intermine_response_factory`` together with ``protein_id``.
    """
    service = Service(intermine_url)
    gene_query = service.new_query(query)
    gene_query.add_view("primaryIdentifier")
    gene_query.add_constraint("primaryIdentifier", "CONTAINS", "ZDB*", code="A")
    gene_query.add_constraint("crossReferences.identifier", "=", "{}".format(protein_id), code="B")

    zfin_curies = []
    for record in gene_query.rows():
        zfin_curies.append("ZFIN:{}".format(record['primaryIdentifier']))
    return intermine_response_factory(zfin_curies, protein_id)
Пример #15
0
    def parse(self, limit=None):
        """Add MouseMine gene-to-phenotype associations to ``self.graph``.

        For every two-digit prefix ``MGI:10`` .. ``MGI:99`` one
        ``OntologyAnnotation`` query is run against MouseMine (mouse taxon
        10090, MP terms on sequence features). Each row becomes a
        ``G2PAssoc`` with evidence ``ECO:0000059``; when the row has a
        PubMed id, a journal-article ``Reference`` is added to the graph and
        attached as the association's source.

        :param limit: Optional maximum number of prefixes to process; a
            falsy value means all of them.
        :returns: None
        """
        # Loop-invariant setup, done once instead of once per prefix: the
        # service connection and the silencing of the two loggers.
        service = Service("http://www.mousemine.org/mousemine/service")
        logging.getLogger('Model').setLevel(logging.CRITICAL)
        logging.getLogger('JSONIterator').setLevel(logging.CRITICAL)

        count = 0
        for num in range(10, 100):
            fuzzy_gene = "MGI:{0}*".format(num)
            gene = "MGI:{0}".format(num)
            query = service.new_query("OntologyAnnotation")
            query.add_constraint("subject", "SequenceFeature")
            query.add_constraint("ontologyTerm", "MPTerm")
            query.add_view("subject.primaryIdentifier", "subject.symbol",
                           "subject.sequenceOntologyTerm.name",
                           "ontologyTerm.identifier", "ontologyTerm.name",
                           "evidence.publications.pubMedId",
                           "evidence.comments.type",
                           "evidence.comments.description")
            query.add_sort_order("OntologyAnnotation.ontologyTerm.name", "ASC")
            query.add_constraint("subject.organism.taxonId",
                                 "=",
                                 "10090",
                                 code="A")
            query.add_constraint("subject", "LOOKUP", fuzzy_gene, code="B")
            query.add_constraint("subject.primaryIdentifier",
                                 "CONTAINS",
                                 gene,
                                 code="C")
            query.outerjoin("evidence.comments")

            for row in query.rows():
                mgi_curie = row["subject.primaryIdentifier"]
                mp_curie = row["ontologyTerm.identifier"]
                pubmed_id = row["evidence.publications.pubMedId"]
                assoc = G2PAssoc(self.graph, self.name, mgi_curie, mp_curie)
                if pubmed_id:
                    # Build the PMID curie only when there is a PubMed id.
                    pub_curie = "PMID:{0}".format(pubmed_id)
                    reference = Reference(
                        self.graph, pub_curie,
                        Reference.ref_types['journal_article'])
                    reference.addRefToGraph()
                    assoc.add_source(pub_curie)

                assoc.add_evidence('ECO:0000059')
                assoc.add_association_to_graph()

            # Progress message every 10 prefixes.
            # NOTE(review): the message reports ``count`` (starting at 0),
            # not the MGI prefix ``num`` (starting at 10) - confirm intent.
            if not count % 10 and count != 0:
                count_from = count - 10
                logger.info(
                    "{0} processed ids from MGI:{1}* to MGI:{2}*".format(
                        datetime.datetime.now(), count_from, count))

            count += 1
            if limit and count >= limit:
                break
Пример #16
0
def get_yeast_gene_location(gene_name):
    '''Acquire the location of a gene from SGD http://www.yeastgenome.org

    Only the first row of the YeastMine query result is used.

    :param gene_name: Name of the gene.
    :type gene_name: string
    :returns location: [int: chromosome, int:biostart, int:bioend, int:strand]
    :rtype location: list
    :raises StopIteration: If the query yields no rows.
    :raises KeyError: If the chromosome identifier is not one of
        ``chrI`` .. ``chrXVI``.

    '''
    from intermine.webservice import Service
    service = Service('http://yeastmine.yeastgenome.org/yeastmine/service')

    query = service.new_query('Gene')

    # The view specifies the output columns
    query.add_view('primaryIdentifier', 'secondaryIdentifier', 'symbol',
                   'name', 'organism.shortName',
                   'chromosome.primaryIdentifier', 'chromosomeLocation.start',
                   'chromosomeLocation.end', 'chromosomeLocation.strand')

    query.add_constraint('organism.shortName', '=', 'S. cerevisiae', code='B')
    query.add_constraint('Gene', 'LOOKUP', gene_name, code='A')

    # Roman-numeral chromosome identifiers -> chromosome numbers.
    chromosomes = {
        'chrI': 1,
        'chrII': 2,
        'chrIII': 3,
        'chrIV': 4,
        'chrV': 5,
        'chrVI': 6,
        'chrVII': 7,
        'chrVIII': 8,
        'chrIX': 9,
        'chrX': 10,
        'chrXI': 11,
        'chrXII': 12,
        'chrXIII': 13,
        'chrXIV': 14,
        'chrXV': 15,
        'chrXVI': 16
    }
    # ``.next()`` is the Python 2 iterator method; the builtins work on both
    # Python 2.6+ and Python 3.
    first_result = next(iter(query.rows()))

    return [
        chromosomes[first_result['chromosome.primaryIdentifier']],
        first_result['chromosomeLocation.start'],
        first_result['chromosomeLocation.end'],
        int(first_result['chromosomeLocation.strand'])
    ]
Пример #17
0
    def download(self, genes, fields, scope=None, species=None):
        '''
        Retrieves the data depending on self.constraints and self.views.

        ``genes`` is split into chunks, one ``Gene LOOKUP`` query is run per
        chunk against ``self.datasource``, and every result row is stored in
        ``self.dataset`` as a tuple ``(symbol, *values of self.views)``.

        :param genes: Sequence of gene identifiers (strings).
        :param fields: Not used by this method.
        :param scope: Not used by this method.
        :param species: Not used by this method.
        :returns: None; the rows are stored in ``self.dataset``.
        '''
        constraints = self.constraints
        views = self.views
        glist = np.array(genes)
        if len(glist) > 1000:
            # Ceiling division, so that no chunk exceeds 1000 genes. The
            # former true division was truncated by array_split, leaving
            # 1001-1999 genes in a single chunk.
            n_chunks = -(-len(glist) // 1000)
            segs = np.array_split(glist, n_chunks)
        else:
            segs = [glist]

        # store the data in here
        z = []

        # API uses letters to distinguish between constraints; "A" is taken
        # by the LOOKUP constraint, so user constraints start at "B".
        alpha = list(string.ascii_uppercase)

        for seg in segs:
            # Connect to the API
            service = SS(self.datasource)
            query = service.new_query("Gene")
            query.add_view(",".join(views))
            # Some databases require a host name
            if self.hostid != "":
                query.add_constraint("Gene",
                                     "LOOKUP",
                                     ",".join(seg),
                                     self.hostid,
                                     code="A")
            else:
                query.add_constraint("Gene", "LOOKUP", ",".join(seg), code="A")

            # Apply the constraints. Only "path=value" and
            # "path IS NOT NULL" forms are recognised; anything else is
            # skipped but still consumes a letter.
            for i, constraint in enumerate(constraints, start=1):
                letter = alpha[i]
                parts = constraint.split("=")
                if len(parts) == 2:
                    query.add_constraint(parts[0], "=", parts[1], code=letter)
                elif re.search("IS NOT NULL", constraint):
                    p1 = constraint.replace(" IS NOT NULL", "")
                    query.add_constraint(p1, "IS NOT NULL", code=letter)

            # Parse the output into a list of tuples
            for row in query.rows():
                t = [row['symbol']]
                for v in views:
                    t.append(row[v])
                z.append(tuple(t))
        self.dataset = z
Пример #18
0
def find_max_data_items(new_list, intermine, intermine_url):
    """Return the largest row count among the given InterMine classes.

    For every class name in ``new_list`` a query selecting ``<class>.*`` is
    counted on the service at ``intermine_url + "/service"``.

    :param new_list: Iterable of class names to count.
    :param intermine: Not used by this function.
    :param intermine_url: Base URL of the mine, without ``/service``.
    :returns: The highest count found, or 0 when ``new_list`` is empty.
    """
    service = Service(intermine_url + "/service")
    largest = 0
    for class_name in new_list:
        query = service.new_query(class_name)
        query.add_view(class_name + ".*")
        # Count once per class (previously two count() calls) and keep the
        # builtin ``max`` unshadowed.
        largest = max(largest, query.count())
    return largest
Пример #19
0
def query(ids):
    """Run a TargetMine ``Protein`` query for the given ids.

    Selects identifier, accession, name, length and compound columns, and
    constrains ``Protein`` with ``IN`` to the comma-joined ``ids``.

    :param ids: Sequence of strings; it is indexed at position 0 and joined
        with commas.
    :returns: The result of ``rows()`` on the configured query.
    """
    service = Service("http://targetmine.nibio.go.jp/targetmine")
    protein_query = service.new_query("Protein")
    columns = (
        "primaryIdentifier", "primaryAccession", "name", "length",
        "compounds.compound.casRegistryNumber",
        "compounds.compound.name",
        "compounds.compound.compoundGroup.name",
    )
    protein_query.add_view(*columns)
    # The value is not used afterwards, but the indexing is kept because it
    # raises for an empty sequence.
    first_id = ids[0]
    id_list = ",".join(ids)
    protein_query.add_constraint("Protein", "IN", id_list)
    return protein_query.rows()
Пример #20
0
def main():
    """Write an expression matrix for the genes listed in ``sys.argv[1]``.

    Each line of the input file (resolved against the current working
    directory) is taken as a gene identifier and looked up in ThaleMine. The
    output ``results/all_genes.csv`` is tab-separated: a header row with one
    ``<SRA accession>-<tissue>`` column per sample seen, then one row per
    input gene with its expression level for each sample, or ``0`` where no
    value was returned.
    """
    if not os.path.exists("results"):
        os.makedirs("results")

    service = Service("https://apps.araport.org/thalemine/service")

    samples = []        # sample labels in first-seen order (column order)
    seen_samples = set()
    expression = {}     # (gene, sample) -> first expression level returned
    gene_names = []

    # ``with`` closes the input file, which was previously never closed.
    with open(os.path.join(os.getcwd(), sys.argv[1])) as gene_file:
        for line in gene_file:
            gene = line.strip()
            query = service.new_query("Gene")
            query.add_view("primaryIdentifier",
                           "RNASeqExpressions.expressionLevel",
                           "RNASeqExpressions.experiment.SRAaccession",
                           "RNASeqExpressions.experiment.tissue",
                           "RNASeqExpressions.unit")
            query.add_sort_order(
                "Gene.RNASeqExpressions.experiment.SRAaccession", "DESC")
            query.add_constraint("primaryIdentifier", "=", gene, code="A")

            for row in query.rows():
                sample = str(
                    row["RNASeqExpressions.experiment.SRAaccession"]) + "-" + str(
                        row["RNASeqExpressions.experiment.tissue"])
                level = str(row["RNASeqExpressions.expressionLevel"])
                if sample not in seen_samples:
                    seen_samples.add(sample)
                    samples.append(sample)
                # Keep the first value per (gene, sample), matching the
                # former first-match list scan.
                expression.setdefault((gene, sample), level)

            gene_names.append(gene)

    # ``with`` guarantees the output is flushed and closed even on error.
    with open("results/all_genes.csv", "w") as out:
        out.write("".join("\t" + sample for sample in samples))
        out.write("\n")
        for gene_name in gene_names:
            out.write(gene_name)
            for sample in samples:
                out.write("\t" + expression.get((gene_name, sample), "0"))
            out.write("\n")
Пример #21
0
def get_yeast_gene_location(gene_name):
    """Acquire the location of a gene from SGD http://www.yeastgenome.org

    Only the first row of the YeastMine query result is used.

    :param gene_name: Name of the gene.
    :type gene_name: string
    :returns location: [int: chromosome, int:biostart, int:bioend, int:strand]
    :rtype location: list
    :raises StopIteration: If the query yields no rows.
    :raises KeyError: If the chromosome identifier is not one of
        ``chrI`` .. ``chrXVI``.

    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    query = service.new_query("Gene")

    # The view specifies the output columns
    query.add_view("primaryIdentifier", "secondaryIdentifier", "symbol",
                   "name", "organism.shortName",
                   "chromosome.primaryIdentifier", "chromosomeLocation.start",
                   "chromosomeLocation.end", "chromosomeLocation.strand")

    query.add_constraint("organism.shortName", "=", "S. cerevisiae", code="B")
    query.add_constraint("Gene", "LOOKUP", gene_name, code="A")

    # Roman-numeral chromosome identifiers -> chromosome numbers.
    chromosomes = {
        "chrI": 1,
        "chrII": 2,
        "chrIII": 3,
        "chrIV": 4,
        "chrV": 5,
        "chrVI": 6,
        "chrVII": 7,
        "chrVIII": 8,
        "chrIX": 9,
        "chrX": 10,
        "chrXI": 11,
        "chrXII": 12,
        "chrXIII": 13,
        "chrXIV": 14,
        "chrXV": 15,
        "chrXVI": 16
    }
    # ``.next()`` is the Python 2 iterator method; the builtins work on both
    # Python 2.6+ and Python 3.
    first_result = next(iter(query.rows()))

    return [
        chromosomes[first_result["chromosome.primaryIdentifier"]],
        first_result["chromosomeLocation.start"],
        first_result["chromosomeLocation.end"],
        int(first_result["chromosomeLocation.strand"])
    ]
Пример #22
0
def get_yeast_gene_location(gene_name):
    '''Acquire the location of a gene from SGD http://www.yeastgenome.org

    Runs a YeastMine ``Gene`` LOOKUP for ``gene_name`` restricted to
    *S. cerevisiae* and reports the coordinates found in the first row of
    the result; any further rows are ignored.

    :param gene_name: Name of the gene.
    :type gene_name: string
    :returns location: [chromosome number, start, end, strand]. The
        chromosome number and the strand are ints; start and end are
        passed through exactly as delivered by the service.
    :rtype location: list
    :raises StopIteration: if the query yields no rows.
    :raises KeyError: if the chromosome identifier of the first row is not
        one of chrI..chrXVI.

    '''
    from intermine.webservice import Service
    service = Service('http://yeastmine.yeastgenome.org/yeastmine/service')

    # Get a new query on the class (table) you will be querying:
    query = service.new_query('Gene')

    # The view specifies the output columns
    query.add_view('primaryIdentifier', 'secondaryIdentifier', 'symbol',
                   'name', 'organism.shortName',
                   'chromosome.primaryIdentifier',
                   'chromosomeLocation.start', 'chromosomeLocation.end',
                   'chromosomeLocation.strand')

    # Constraint B pins the organism, constraint A selects the gene.  No
    # explicit logic string is set for combining them.
    query.add_constraint('organism.shortName', '=', 'S. cerevisiae',
                         code='B')
    query.add_constraint('Gene', 'LOOKUP', gene_name, code='A')

    # Translate the chromosome identifiers used in the result rows into
    # chromosome numbers.
    chromosomes = {'chrI': 1,
                   'chrII': 2,
                   'chrIII': 3,
                   'chrIV': 4,
                   'chrV': 5,
                   'chrVI': 6,
                   'chrVII': 7,
                   'chrVIII': 8,
                   'chrIX': 9,
                   'chrX': 10,
                   'chrXI': 11,
                   'chrXII': 12,
                   'chrXIII': 13,
                   'chrXIV': 14,
                   'chrXV': 15,
                   'chrXVI': 16}

    # The builtin next() only needs the iterator protocol, so this does not
    # depend on the result object exposing a Python-2-style .next() method.
    first_result = next(iter(query.rows()))

    return [chromosomes[first_result['chromosome.primaryIdentifier']],
            first_result['chromosomeLocation.start'],
            first_result['chromosomeLocation.end'],
            int(first_result['chromosomeLocation.strand'])]
Пример #23
0
def query(ids):
    """Query TargetMine for proteins and the compounds linked to them.

    :param ids: non-empty sequence of identifier strings.  They are joined
        with commas and used as the value of an ``IN`` constraint on
        ``Protein``.
    :returns: whatever ``rows()`` returns for the constructed query.
    :raises IndexError: if ``ids`` is empty.
    """
    # The previous implementation read ids[0] into an unused variable, which
    # only had the side effect of rejecting empty input.  Keep that contract
    # (same exception type) but make it deliberate.
    if len(ids) == 0:
        raise IndexError("ids must contain at least one identifier")

    service = Service("http://targetmine.nibio.go.jp/targetmine")
    protein_query = service.new_query("Protein")
    protein_query.add_view(
        "primaryIdentifier", "primaryAccession", "name", "length",
        "compounds.compound.casRegistryNumber", "compounds.compound.name",
        "compounds.compound.compoundGroup.name"
    )
    protein_query.add_constraint("Protein", "IN", ",".join(ids))
    return protein_query.rows()
Пример #24
0
    def download(self, genes, fields, scope=None, species=None):
        '''
        Retrieve the data selected by self.constraints and self.views.

        The genes are looked up in segments; every result row is turned
        into a tuple ``(row['symbol'], row[view_1], ..., row[view_n])`` and
        the list of all tuples is stored in ``self.dataset``.  Nothing is
        returned.

        :param genes: gene identifiers (strings) to look up.
        :param fields: not used by this method.
        :param scope: not used by this method.
        :param species: not used by this method.

        Each entry of self.constraints is either ``"path=value"`` or
        ``"path IS NOT NULL"``; entries of any other form are ignored but
        still consume a constraint letter.
        '''
        constraints = self.constraints
        views = self.views
        glist = np.array(genes)
        if len(glist) > 1000:
            # Ceiling division: the segment count must be an integer, and
            # rounding up keeps every segment at 1000 genes or fewer (true
            # division gave a float, which rounds down at best).
            n_segments = -(-len(glist) // 1000)
            segs = np.array_split(glist, n_segments)
        else:
            segs = [glist]

        # store the data in here
        z = []

        # API uses letters to distinguish between constraints
        alpha = string.ascii_uppercase

        for seg in segs:
            # Connect to the API
            service = SS(self.datasource)
            query = service.new_query("Gene")
            query.add_view(",".join(views))
            # Some databases require a host name
            if self.hostid != "":
                query.add_constraint("Gene", "LOOKUP", ",".join(seg),
                                     self.hostid, code="A")
            else:
                query.add_constraint("Gene", "LOOKUP", ",".join(seg), code="A")

            # Apply the constraints.  "A" is taken by the LOOKUP above, so
            # the extra constraints are lettered from "B" onwards.
            for i, constraint in enumerate(constraints, start=1):
                letter = alpha[i]
                parts = constraint.split("=")
                if len(parts) == 2:
                    query.add_constraint(parts[0], "=", parts[1], code=letter)
                elif "IS NOT NULL" in constraint:
                    p1 = constraint.replace(" IS NOT NULL", "")
                    query.add_constraint(p1, "IS NOT NULL", code=letter)

            # Parse the output into a list of tuples
            for row in query.rows():
                z.append(tuple([row['symbol']] + [row[v] for v in views]))
        self.dataset = z
Пример #25
0
def get_yeast_gene_location(gene_name):
    """Acquire the location of a gene from SGD http://www.yeastgenome.org

    Runs a YeastMine ``Gene`` LOOKUP for ``gene_name`` restricted to
    *S. cerevisiae* and reports the coordinates found in the first row of
    the result; any further rows are ignored.

    :param gene_name: Name of the gene.
    :type gene_name: string
    :returns location: [chromosome number, start, end, strand]. The
        chromosome number and the strand are ints; start and end are
        passed through exactly as delivered by the service.
    :rtype location: list
    :raises StopIteration: if the query yields no rows.
    :raises KeyError: if the chromosome identifier of the first row is not
        one of chrI..chrXVI.

    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Gene")

    # The view specifies the output columns
    query.add_view("primaryIdentifier", "secondaryIdentifier", "symbol",
                   "name", "organism.shortName",
                   "chromosome.primaryIdentifier",
                   "chromosomeLocation.start", "chromosomeLocation.end",
                   "chromosomeLocation.strand")

    # Constraint B pins the organism, constraint A selects the gene.  No
    # explicit logic string is set for combining them.
    query.add_constraint("organism.shortName", "=", "S. cerevisiae",
                         code="B")
    query.add_constraint("Gene", "LOOKUP", gene_name, code="A")

    # Translate the chromosome identifiers used in the result rows into
    # chromosome numbers.
    chromosomes = {"chrI": 1,
                   "chrII": 2,
                   "chrIII": 3,
                   "chrIV": 4,
                   "chrV": 5,
                   "chrVI": 6,
                   "chrVII": 7,
                   "chrVIII": 8,
                   "chrIX": 9,
                   "chrX": 10,
                   "chrXI": 11,
                   "chrXII": 12,
                   "chrXIII": 13,
                   "chrXIV": 14,
                   "chrXV": 15,
                   "chrXVI": 16}

    # The builtin next() only needs the iterator protocol, so this does not
    # depend on the result object exposing a Python-2-style .next() method.
    first_result = next(iter(query.rows()))

    return [chromosomes[first_result["chromosome.primaryIdentifier"]],
            first_result["chromosomeLocation.start"],
            first_result["chromosomeLocation.end"],
            int(first_result["chromosomeLocation.strand"])]
def intermine_query(type):
    """Build a YeastMine query that selects identifier and sequence.

    :param type: name of the class (table) handed to ``new_query``.
    :returns: the query object with the views ``primaryIdentifier`` and
        ``sequence.residues`` added.  No constraint or sort order is set
        and the query is not executed here.
    """
    from intermine.webservice import Service

    output_columns = ("primaryIdentifier", "sequence.residues")

    yeastmine = Service("http://yeastmine.yeastgenome.org/yeastmine/service")
    prepared = yeastmine.new_query(type)
    prepared.add_view(*output_columns)
    return prepared
Пример #27
0
    def parse(self, limit=None):
        """
        Load mouse gene-to-phenotype annotations from MouseMine into the graph.

        For every two-digit prefix ``MGI:10`` .. ``MGI:99`` an
        ``OntologyAnnotation`` query is run for MP-term annotations whose
        subject is a sequence feature of taxon ``self.txid``.  Each result
        row becomes a ``G2PAssoc`` added to ``self.graph``; when the row
        carries a PubMed id, a journal-article reference is added to the
        graph and attached to the association as a source.

        :param limit: if truthy, stop once this many prefixes have been
            processed (prefixes are counted, not result rows).
        :returns: None
        """
        # The client object and the logger levels do not depend on the
        # prefix, so they are set up once rather than on every iteration.
        service = Service("http://www.mousemine.org/mousemine/service")
        logging.getLogger('Model').setLevel(logging.ERROR)
        logging.getLogger('JSONIterator').setLevel(logging.ERROR)

        count = 0
        for num in range(10, 100):
            fuzzy_gene = "MGI:{0}*".format(num)
            gene = "MGI:{0}".format(num)
            query = service.new_query("OntologyAnnotation")
            query.add_constraint("subject", "SequenceFeature")
            query.add_constraint("ontologyTerm", "MPTerm")
            query.add_view(
                "subject.primaryIdentifier", "subject.symbol",
                "subject.sequenceOntologyTerm.name", "ontologyTerm.identifier",
                "ontologyTerm.name", "evidence.publications.pubMedId",
                "evidence.comments.type", "evidence.comments.description"
            )
            query.add_sort_order("OntologyAnnotation.ontologyTerm.name", "ASC")
            query.add_constraint("subject.organism.taxonId", "=", self.txid, code="A")
            query.add_constraint("subject", "LOOKUP", fuzzy_gene, code="B")
            query.add_constraint(
                "subject.primaryIdentifier", "CONTAINS", gene, code="C")
            query.outerjoin("evidence.comments")

            for row in query.rows():
                mgi_curie = row["subject.primaryIdentifier"]
                mp_curie = row["ontologyTerm.identifier"]
                assoc = G2PAssoc(self.graph, self.name, mgi_curie, mp_curie)

                # Only rows that actually carry a PubMed id get a reference.
                pubmed_id = row["evidence.publications.pubMedId"]
                if pubmed_id:
                    pub_curie = "PMID:{0}".format(pubmed_id)
                    reference = Reference(
                        self.graph, pub_curie, self.globaltt['journal article'])
                    reference.addRefToGraph()
                    assoc.add_source(pub_curie)

                assoc.add_evidence(self.globaltt['experimental phenotypic evidence'])
                assoc.add_association_to_graph()

            # Progress message after every tenth prefix (never for the first).
            if count != 0 and count % 10 == 0:
                count_from = count - 10
                LOG.info(
                    "%s processed ids from MGI:%i* to MGI:%i*",
                    datetime.datetime.now(), count_from, count)

            count += 1
            if limit and count >= limit:
                break
Пример #28
0
def query_mousemine(intermine_url: str, gene_id: str) -> IntermineResult:
    """
    Look a gene up among the M. musculus sequence features of an
    InterMine server and wrap the matching primary identifiers.

    :param intermine_url: intermine server, eg
                          http://www.mousemine.org/mousemine/service
    :param gene_id: gene ID, eg ENSMUSG00000063180
    :return: the value produced by ``intermine_response_factory`` for the
             list of matching identifiers and ``gene_id``
    """
    mine = Service(intermine_url)
    feature_query = mine.new_query("SequenceFeature")
    feature_query.add_view("primaryIdentifier")
    feature_query.add_constraint(
        "SequenceFeature", "LOOKUP", "{}".format(gene_id), code="A")
    feature_query.add_constraint(
        "organism.shortName", "=", "M. musculus", code="B")

    identifiers = []
    for hit in feature_query.rows():
        identifiers.append("{}".format(hit['primaryIdentifier']))
    return intermine_response_factory(identifiers, gene_id)
Пример #29
0
def query_fishmine(intermine_url: str,
                   protein_id: str,
                   query: str = "Gene") -> IntermineResult:
    """
    Find records whose primary identifier contains "ZDB*" and that carry
    ``protein_id`` as a cross-reference identifier.

    :param intermine_url: InterMine service URL
    :param protein_id: cross-reference identifier to match exactly
    :param query: name of the class to query (default "Gene")
    :return: the value produced by ``intermine_response_factory`` for the
             "ZFIN:"-prefixed identifiers and ``protein_id``
    """
    mine = Service(intermine_url)
    record_query = mine.new_query(query)
    record_query.add_view("primaryIdentifier")
    record_query.add_constraint(
        "primaryIdentifier", "CONTAINS", "ZDB*", code="A")
    record_query.add_constraint(
        "crossReferences.identifier", "=", "{}".format(protein_id), code="B")

    zfin_ids = []
    for hit in record_query.rows():
        zfin_ids.append("ZFIN:{}".format(hit['primaryIdentifier']))
    return intermine_response_factory(zfin_ids, protein_id)
Пример #30
0
 def test(self):
     '''
     Tests the HumanMine API
     Look up symbol for APOBEC3G, should return APOBEC3G.

     Returns 1 when the symbol of the last row returned is "APOBEC3G",
     otherwise 0 (including when the query returns no rows at all).
     '''
     service = SS('http://www.humanmine.org/humanmine/service')
     query = service.new_query("Gene")
     query.add_view("symbol")
     query.add_constraint("Gene", "LOOKUP", "APOBEC3G", code="A")
     # Start from None so that an empty result is reported as a failed
     # check instead of raising UnboundLocalError below.
     symbol = None
     for row in query.rows():
         symbol = row['symbol']
     return 1 if symbol == "APOBEC3G" else 0
Пример #31
0
 def test(self):
     '''
     Tests the HumanMine API
     Look up symbol for APOBEC3G, should return APOBEC3G.

     Returns 1 when the symbol of the last row returned is "APOBEC3G",
     otherwise 0 (including when the query returns no rows at all).
     '''
     service = SS('http://www.humanmine.org/humanmine/service')
     query = service.new_query("Gene")
     query.add_view("symbol")
     query.add_constraint("Gene", "LOOKUP", "APOBEC3G", code="A")
     # Start from None so that an empty result is reported as a failed
     # check instead of raising UnboundLocalError below.
     symbol = None
     for row in query.rows():
         symbol = row['symbol']
     return 1 if symbol == "APOBEC3G" else 0
Пример #32
0
def query_intermine(genes):
    """Build (without running) a MouseMine phenotype-annotation query.

    :param genes: iterable of gene identifier strings; they are joined with
        ", " and used as the value of a LOOKUP constraint on the subject.
    :returns: the prepared ``OntologyAnnotation`` query, restricted to MP
        terms on sequence features of taxon 10090, sorted by term name,
        with an outer join on ``evidence.comments``.
    """
    from intermine.webservice import Service

    lookup_value = ', '.join(genes)
    output_columns = (
        "subject.primaryIdentifier",
        "subject.symbol",
        "subject.sequenceOntologyTerm.name",
        "ontologyTerm.identifier",
        "ontologyTerm.name",
        "evidence.publications.pubMedId",
        "evidence.comments.type",
        "evidence.comments.description",
    )

    mousemine = Service("http://www.mousemine.org/mousemine/service")
    annotation_query = mousemine.new_query("OntologyAnnotation")
    # Type constraints are added before the paths they constrain are used.
    annotation_query.add_constraint("ontologyTerm", "MPTerm")
    annotation_query.add_constraint("subject", "SequenceFeature")
    annotation_query.add_view(*output_columns)
    annotation_query.add_sort_order(
        "OntologyAnnotation.ontologyTerm.name", "ASC")
    annotation_query.add_constraint(
        "subject.organism.taxonId", "=", "10090", code="A")
    annotation_query.add_constraint(
        "subject", "LOOKUP", lookup_value, code="B")
    annotation_query.outerjoin("evidence.comments")
    return annotation_query
Пример #33
0
def query_mousemine(intermine_url: str, gene_id: str) -> IntermineResult:
    """
    Look a gene up among the M. musculus sequence features of an
    InterMine server and wrap the matching primary identifiers.

    :param intermine_url: intermine server, eg
                          http://www.mousemine.org/mousemine/service
    :param gene_id: gene ID, eg ENSMUSG00000063180
    :return: the value produced by ``intermine_response_factory`` for the
             list of matching identifiers and ``gene_id``
    """
    mine = Service(intermine_url)
    feature_query = mine.new_query("SequenceFeature")
    feature_query.add_view("primaryIdentifier")
    feature_query.add_constraint(
        "SequenceFeature", "LOOKUP", "{}".format(gene_id), code="A")
    feature_query.add_constraint(
        "organism.shortName", "=", "M. musculus", code="B")

    identifiers = []
    for hit in feature_query.rows():
        identifiers.append("{}".format(hit['primaryIdentifier']))
    return intermine_response_factory(identifiers, gene_id)
Пример #34
0
def query_mousemine_to_create_mouse_het_lethal_knockout_genes():
    """Export MouseMine annotations with zygosity "ht" and a term name
    containing "lethal" to ``mouse_het_lethal.tsv``.

    The query covers MP-term annotations on sequence features whose base
    annotation subject is a genotype, sorted by subject symbol.  The result
    is written as a tab-delimited table (one header row followed by one row
    per result) under ``SOURCE_DATA_FOLDER``.  Nothing is returned.
    """
    # Output columns, in the order in which they appear in each table row.
    columns = (
        "subject.primaryIdentifier",
        "subject.symbol",
        "evidence.baseAnnotations.subject.symbol",
        "evidence.baseAnnotations.subject.background.name",
        "evidence.baseAnnotations.subject.zygosity",
        "ontologyTerm.identifier",
        "ontologyTerm.name",
    )

    mousemine = Service("http://www.mousemine.org/mousemine/service")
    annotations = mousemine.new_query("OntologyAnnotation")
    annotations.add_constraint("ontologyTerm", "MPTerm")
    annotations.add_constraint("subject", "SequenceFeature")
    annotations.add_constraint("evidence.baseAnnotations.subject", "Genotype")
    annotations.add_view(*columns)
    annotations.add_sort_order("OntologyAnnotation.subject.symbol", "ASC")
    annotations.add_constraint(
        "evidence.baseAnnotations.subject.zygosity", "=", "ht", code="B")
    annotations.add_constraint(
        "ontologyTerm.name", "CONTAINS", "lethal", code="A")

    table = [[
        'Subject Primary Identifier',
        'Ontology Annotation Subject . Symbol',
        'Base Annotations Subject . Symbol',
        'Subject Background',
        'Subject Zygosity',
        'Ontology Annotation Ontology Term . Identifier',
        'Ontology Annotation Term Name',
    ]]
    for result in annotations.rows():
        table.append([result[column] for column in columns])

    write_table_to_csv(
        table, SOURCE_DATA_FOLDER + 'mouse_het_lethal.tsv', delimiter='\t')
Пример #35
0
def getData(mine):
    """
    A function to get datasets corresponding to a mine
    ================================================
    example:

        >>> from intermine import registry
        >>> registry.getData('flymine')
        Name: Affymetrix array: Drosophila1
        Name: Affymetrix array: Drosophila2
        Name: Affymetrix array: GeneChip Drosophila Genome 2.0 Array
        Name: Affymetrix array: GeneChip Drosophila Genome Array
        Name: Anoph-Expr data set
        Name: BDGP cDNA clone data set.....

    The dataset names are printed in sorted order, one per line.

    Returns None after printing, or the string "No such mine available"
    when a KeyError is raised while the registry answer is read or the
    mine is queried.
    """
    x = "http://registry.intermine.org/service/instances/" + mine
    try:
        r = requests.get(x)
        # Local names deliberately avoid shadowing the builtins dict/list.
        registry_info = json.loads(r.text)
        link = registry_info["instance"]["url"]
        service = Service(link)
        query = service.new_query("DataSet")
        query.add_view("name", "url")
        names = []

        for row in query.rows():
            try:
                names.append(row["name"])

            except KeyError:
                print("No info available")
        names.sort()
        for name in names:
            print("Name: " + name)
        return None
    except KeyError:
        return "No such mine available"
Пример #36
0
    def mine(self, category):
        """Fetch sequences from PhytoMine and return them as FASTA-style text.

        ``self.cleaned_data['locus_id']`` is split on '\\r\\n' into names;
        for each name the service is queried for records of class
        ``category`` with exactly that name, and every row contributes a
        ``>primaryIdentifier`` header line followed by its residues.

        :param category: name of the class handed to ``new_query``.
        :returns: the concatenated records as one string ('' when no row
            was returned).
        """
        from intermine.webservice import Service

        # The names are only iterated, so a plain list is sufficient.
        locus_names = self.cleaned_data['locus_id'].split('\r\n')

        service = Service("https://phytozome.jgi.doe.gov/phytomine/service")

        query = service.new_query(category)

        query.add_view("name", "primaryIdentifier", "secondaryIdentifier",
                       "sequence.residues")

        # Collect the records and join once instead of growing a string.
        records = []
        for locus_name in locus_names:
            # NOTE(review): the same code "D" is reused on every pass;
            # presumably the client replaces the constraint previously
            # registered under that code -- confirm.
            query.add_constraint("name", "=", locus_name, code="D")
            for row in query.rows():
                records.append(">" + str(row["primaryIdentifier"]) + '\n'
                               + row["sequence.residues"] + '\n')

        return ''.join(records)
Пример #37
0
def getData(mine):
    """
    A function to get datasets corresponding to a mine
    ================================================
    example:

        >>> from intermine import registry
        >>> registry.getData('flymine')
        Name: Affymetrix array: Drosophila1
        Name: Affymetrix array: Drosophila2
        Name: Affymetrix array: GeneChip Drosophila Genome 2.0 Array
        Name: Affymetrix array: GeneChip Drosophila Genome Array
        Name: Anoph-Expr data set
        Name: BDGP cDNA clone data set.....

    The dataset names are printed in sorted order, one per line.

    Returns None after printing, or the string "No such mine available"
    when a KeyError is raised while the registry answer is read or the
    mine is queried.
    """
    x = "http://registry.intermine.org/service/instances/" + mine
    try:
        r = requests.get(x)
        # Local names deliberately avoid shadowing the builtins dict/list.
        registry_info = json.loads(r.text)
        link = registry_info["instance"]["url"]
        service = Service(link)
        query = service.new_query("DataSet")
        query.add_view("name", "url")
        names = []

        for row in query.rows():
            try:
                names.append(row["name"])

            except KeyError:
                print("No info available")
        names.sort()
        for name in names:
            print("Name: " + name)
        return None
    except KeyError:
        return "No such mine available"
Пример #38
0
    def mine(self):
        """Fetch upstream flanking sequences from PhytoMine as FASTA-style text.

        ``self.cleaned_data['promoter']`` is split on '\\r\\n' into gene
        names.  For each name the 5000-length upstream flanking region that
        excludes the gene is requested, and every row contributes a
        ``>primaryIdentifier`` header line followed by the flanking residues
        from index ``5000 - size`` onwards, where ``size`` is
        ``self.cleaned_data['size']``.

        :returns: the concatenated records as one string ('' when no row
            was returned).
        """
        from intermine.webservice import Service

        size = self.cleaned_data['size']
        # The names are only iterated, so a plain list is sufficient.
        gene_names = self.cleaned_data['promoter'].split('\r\n')

        service = Service("https://phytozome.jgi.doe.gov/phytomine/service")

        query = service.new_query("Gene")

        query.add_view("name", "primaryIdentifier", "secondaryIdentifier",
                       "length", "flankingRegions.length",
                       "flankingRegions.includeGene",
                       "flankingRegions.direction",
                       "flankingRegions.primaryIdentifier",
                       "flankingRegions.sequence.length",
                       "flankingRegions.sequence.residues")

        query.add_constraint("flankingRegions.length", "=", "5000", code="A")
        query.add_constraint("flankingRegions.includeGene",
                             "=",
                             "false",
                             code="B")
        query.add_constraint("flankingRegions.direction",
                             "=",
                             "upstream",
                             code="C")

        # Collect the records and join once instead of growing a string.
        records = []
        for gene_name in gene_names:
            # NOTE(review): the same code "D" is reused on every pass;
            # presumably the client replaces the constraint previously
            # registered under that code -- confirm.
            query.add_constraint("name", "=", gene_name, code="D")
            for row in query.rows():
                flank = row["flankingRegions.sequence.residues"]
                records.append(">" + str(row["primaryIdentifier"]) + '\n'
                               + str(flank[5000 - size:]) + '\n')

        return ''.join(records)
Пример #39
0
        # Write the sequence held in data_set[1] to three.seq (presumably the
        # 3' probe, by analogy with the 5' block below): both constant
        # sequences are stripped, the remainder is reverse-complemented, and
        # the result is also stored back into data_set[1].
        with open("three.seq", "w") as seq_file_three:
            three_constant_type_A = 'GTAATTAACCCTCACTAAAGGG'
            three_constant_type_B = 'CTAATTAACCCTCACTAAAGGG'
            three_end = data_set[1].replace(three_constant_type_A, '').replace(
                three_constant_type_B, '')
            seq = Seq(three_end)
            data_set[1] = str(seq.reverse_complement())
            seq_file_three.write(data_set[1])

        # Write 5' probe to five.seq (data_set[2] is written unmodified)
        with open("five.seq", "w") as seq_file_five:
            seq_file_five.write(data_set[2])

        # Look up the transcripts of the gene named by data_set[0] in FlyMine.
        # Only identifier and length are requested here, longest first.
        service = Service("http://www.flymine.org/flymine/service")
        query = service.new_query("Gene")
        query.add_view("transcripts.primaryIdentifier", "transcripts.length")
        query.add_sort_order("transcripts.length", "DESC")
        query.add_constraint("Gene", "LOOKUP", data_set[0], code="A")
        transcript_primary_identifiers = []

        # For each transcript found for a given gene: print its identifier
        # and length, and remember the identifier.
        for row in query.rows():
            print("")
            print("TRANSCRIPT ID: ", row["transcripts.primaryIdentifier"],
                  "||", "TRANSCRIPT LENGTH: ", row["transcripts.length"])
            transcript_primary_identifiers.append(
                row["transcripts.primaryIdentifier"])

        print("")
        print("")
Пример #40
0
def fetch_yeast_locus_sequence(locus_name, flanking_size=0):
    '''Acquire a sequence from SGD http://www.yeastgenome.org.

    With ``flanking_size == 0`` the sequence of the locus itself is
    requested; with ``flanking_size > 0`` the residues of the flanking
    region record with direction "both" and the matching distance are
    requested instead.  Only the first row of the result is used.

    :param locus_name: Common name or systematic name for the locus (e.g. ACT1
                       or YFL039C).
    :type locus_name: str
    :param flanking_size: The length of flanking DNA (on each side) to return
    :type flanking_size: int
    :returns: a ``coral.DNA`` built from the residues of the first result
              row, or an empty ``coral.DNA`` (after printing a message) when
              ``flanking_size`` is neither positive nor zero.
    :raises StopIteration: if the query yields no rows.

    '''
    from intermine.webservice import Service

    service = Service('http://yeastmine.yeastgenome.org/yeastmine/service')

    # Get a new query on the class (table) you will be querying:
    query = service.new_query('Gene')

    if flanking_size > 0:

        # The view specifies the output columns
        # secondaryIdentifier: the systematic name (e.g. YFL039C)
        # symbol: short name (e.g. ACT1)
        # length: sequence length
        # flankingRegions.direction: Upstream or downstream (or both) of locus
        # flankingRegions.sequence.length: length of the flanking regions
        # flankingRegions.sequence.residues: sequence of the flanking regions
        query.add_view('secondaryIdentifier', 'symbol', 'length',
                       'flankingRegions.direction',
                       'flankingRegions.sequence.length',
                       'flankingRegions.sequence.residues')

        query.add_constraint('flankingRegions.direction', '=', 'both',
                             code='A')
        query.add_constraint('Gene', 'LOOKUP', locus_name, 'S. cerevisiae',
                             code='B')
        # The distance is expressed in kilobases with one decimal place,
        # e.g. 500 -> '0.5kb'.
        query.add_constraint('flankingRegions.distance', '=',
                             '{:.1f}kb'.format(flanking_size / 1000.),
                             code='C')
        query.set_logic('A and B and C')

        # TODO: What to do when there's more than one result?
        # The builtin next() only needs the iterator protocol, so this does
        # not depend on a Python-2-style .next() method.
        first_result = next(iter(query.rows()))

        seq = coral.DNA(first_result['flankingRegions.sequence.residues'])
        # TODO: add more metadata

    elif flanking_size == 0:
        # The view specifies the output columns
        query.add_view('primaryIdentifier', 'secondaryIdentifier', 'symbol',
                       'name', 'sgdAlias', 'organism.shortName',
                       'sequence.length', 'sequence.residues', 'description',
                       'qualifier')

        query.add_constraint('status', 'IS NULL', code='D')
        query.add_constraint('status', '=', 'Active', code='C')
        query.add_constraint('qualifier', 'IS NULL', code='B')
        query.add_constraint('qualifier', '!=', 'Dubious', code='A')
        query.add_constraint('Gene', 'LOOKUP', locus_name, 'S. cerevisiae',
                             code='E')

        # Keep loci that are not "Dubious" (or have no qualifier) and are
        # "Active" (or have no status), and that match the lookup.
        query.set_logic('(A or B) and (C or D) and E')

        first_result = next(iter(query.rows()))
        seq = coral.DNA(first_result['sequence.residues'])
    else:
        # FIXME: Use logger module instead
        # print() with a single argument behaves the same on Python 2 and 3;
        # the former print statement was a syntax error on Python 3.
        print('Problem with the flanking region size....')
        seq = coral.DNA('')

    return seq
else:

    # To run your query
    # to use it you will require the intermine python client.
    # To install the client, run the following command from a terminal:
    #
    #     sudo easy_install intermine
    #
    # For further documentation you can visit:
    #     http://intermine.readthedocs.org/en/latest/web-services/

    # The following two lines will be needed in every python script:
    from intermine.webservice import Service
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")
    # Query SequenceFeature records and select six descriptive columns.
    query = service.new_query("SequenceFeature")
    query.add_view(
        "primaryIdentifier", "featureType", "secondaryIdentifier", "description",
        "sgdAlias", "symbol"
    )
    # Each constraint below carries its own code letter.  No set_logic call
    # is visible in this excerpt, so how the coded constraints are combined
    # cannot be told from here.
    query.add_constraint("featureType", "=", "telomerase_RNA_gene", code = "Z")
    query.add_constraint("qualifier", "IS NULL", code = "W")
    query.add_constraint("qualifier", "!=", "Dubious", code = "V")
    query.add_constraint("status", "=", "Active", code = "U")
    query.add_constraint("featureType", "=", "transposable_element_gene", code = "S")
    query.add_constraint("featureType", "=", "telomeric_repeat", code = "R")
    query.add_constraint("featureType", "=", "telomere", code = "Q")
    query.add_constraint("featureType", "=", "tRNA_gene", code = "P")
    query.add_constraint("featureType", "=", "snoRNA_gene", code = "O")
    query.add_constraint("featureType", "=", "snRNA_gene", code = "N")
    query.add_constraint("featureType", "=", "LTR_retrotransposon", code = "M")
Пример #42
0
def find_forkhead(chrom_ident, pattern):
    """Locate matches of a motif pattern on both strands of a chromosome.

    The chromosome sequence is fetched from YeastMine, the pattern is
    searched on the sequence and on its reverse complement, and the start
    positions (on the fetched sequence) of every distinct matched string
    are printed and appended to ``output_data/<chrom_ident>_fkh_motifs.csv``
    as ``chrom_ident,position,pattern name`` lines.  The file is only
    touched when at least one position was found.  Nothing is returned.

    :param chrom_ident: primary identifier of the chromosome to fetch.
    :param pattern: mapping with the keys "pattern" (the regular expression
        to search for) and "name" (the label used in the output).
    """
    import os

    import regex
    from intermine.webservice import Service

    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Chromosome")

    # The view specifies the output columns
    query.add_view("primaryIdentifier", "sequence.residues")

    query.add_constraint("Chromosome.primaryIdentifier", "=", chrom_ident)

    # Return one result and raise error if there is more than one
    query.one()

    chromosome = ''
    for row in query.rows():
        chromosome = row["sequence.residues"]

    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    def rev_comp(dna):
        """Return the reverse complement of ``dna``.

        Characters other than A, C, G and T are dropped, with one message
        printed for each of them.
        """
        # Collect into a list and join once; repeated string concatenation
        # is needlessly slow on a chromosome-sized input.
        comp = []
        for nucl in dna:
            if nucl in complement:
                comp.append(complement[nucl])
            else:
                print('Not a DNA sequence')
        comp.reverse()
        return ''.join(comp)

    chromosome_rev = rev_comp(chromosome)

    watson_matches = regex.finditer(pattern["pattern"], chromosome)
    crick_matches = regex.finditer(pattern["pattern"], chromosome_rev)
    all_matches = [match.group(0) for match in watson_matches]
    # Matches on the reverse strand are turned back into the string that
    # appears on the fetched sequence.
    all_matches += [rev_comp(match.group(0)) for match in crick_matches]
    all_matches = list(set(all_matches))

    # Find where each distinct matched string starts on the fetched sequence.
    fkh_motif_coords = []
    for motif in all_matches:
        fkh_motif_coords += [
            hit.start() for hit in regex.finditer(motif, chromosome)
        ]

    fkh_motif_coords.sort()
    print("Pattern {} matches: {}".format(pattern["name"], fkh_motif_coords))
    # Build the path with os.path.join: the former "output_data\{}..."
    # literal contained an invalid escape sequence and only denoted a
    # directory on Windows.
    file_name = os.path.join("output_data",
                             "{}_fkh_motifs.csv".format(chrom_ident))

    if fkh_motif_coords:
        with open(file_name, 'a') as f:
            for position in fkh_motif_coords:
                f.write("{},{},{}\n".format(chrom_ident, position, pattern["name"]))
Пример #43
0
# Path of the text file holding one gene name per line.
gene_name_list = args.gene_name_list

# read the taxon ids
# (first CSV column becomes the key, second column the value)
taxon_ids = {}
with open('data/phytozome_species_ids.txt') as csv_file:
    t_ids = csv.reader(csv_file, delimiter=',')
    for row in t_ids:
        taxon_ids[row[0]] = row[1]

# read the gene names
with open(gene_name_list, 'r') as f:
    gene_names = [line.strip() for line in f]

# run the Phytomine query
service = Service('https://phytozome.jgi.doe.gov/phytomine/service')
query = service.new_query('Gene')
views = ["length", "organism.taxonId", "primaryIdentifier",
         "organism.shortName",
         "ontologyAnnotations.ontologyTerm.identifier",
         "ontologyAnnotations.ontologyTerm.name",
         "ontologyAnnotations.ontologyTerm.namespace"]
query.add_view(views)
# A: the gene must be one of the names read above;
# B: its organism must have one of the taxon ids read above.
query.add_constraint('primaryIdentifier', 'ONE OF', gene_names, code='A')
query.add_constraint('organism.taxonId', 'ONE OF',
                     list(taxon_ids.values()), code='B')
query.set_logic("A and B")

# write results
# NOTE(review): mkstemp() returns (file descriptor, path); only the path is
# kept here, so the already-open descriptor is never closed by this code.
tmp = tempfile.mkstemp(suffix=".txt", text=True)[1]
sep = "\t"
with open(tmp, 'w') as outfile:
Пример #44
0
from intermine.webservice import Service

# Client for the YeastMine web service.
service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

# Get a new query on the class (table) you will be querying:
query = service.new_query("Chromosome")

# The view specifies the output columns
# (identifier and full sequence; no constraint is added in this setup)
query.add_view("primaryIdentifier", "sequence.residues")

# Uncomment and edit the line below (the default) to select a custom sort order:
# query.add_sort_order("Chromosome.primaryIdentifier", "ASC")
# chrIII = 'GATTACAGGGAATTTGTTTAATAGCAATTTATACGCTTTGTTATCGGCACCACCAAATTCTGGGATAACCGTTAATTCTTCCTCAGGTTTGCCTAGTGGATCCTCTCCTTCTGGAGTTTGGCCACGCTCTGGCTTTTCGATCAGACTTGGCATGTGACTAATCAAGTATGGCATGCTGGTTTTTGGGTCCTTTGTTTTCGTTGTTTCAGTCTGGATAAATTTTAAGTTACCATTATCGAAGGCACTTTTGTACTTGTCACTAATTAAAGATGCAATGTCAGCGGGGATACTCATTTTTATTTTAATTTTTACTTTTCTGTTTGTTCTAAAATCTATCTAAACTGGCTTTCAAGATCAATCTATTGTCTTTTAAGGTAAACTTTAAATTGGAAATAATAGTAATGTTAGTTCCTTCATTTTAACCTTGTATTGTATTTCCTTTGCGTGATGAAAAAAAAACTGAAAAAGAGAAAAATAAGAAAATCTTCTAGAACGTTCCGAAACAGGACACTTAGCACACAAATACAGAATAGGAAAGTAAAAGGCAATATATGAATGCAGTGCTTGTAACTGGTGCTTGTATCCAAGAATAGCTTCTTGCTGTAGGTTATGGGAATATCGTGTAAGCTGGGGTGACTTTTGAGCTATTCGCGACGCCCGACGCCGTAATAACTACTTTCGACAGACCACTTATGACAGTATTTCAGGCCGCTCTTATAAAATGACATGTTAACAAACAGTTCTGATTATTCGCCTTTTGACAGGACGATAATGTAAATAGTTGTGGTAGTATCATTCAGGTATGTAACTGTTTACTTTGTATCGCTTGAAAAAAATAAGCATTTCAGAGCCTTCTTTGGAGCTCAAGTGGATTGAGGCCACAGCAAGACCGGCCAGTTTGAATGCTCAACTCTTCAAAAGAAATTCCTCAAATATGTCCAGTTTCATGTACTGTCCGGTGTGATTTATTATTTTTTATTTACTTTGTAGTTCTTAAAGCTAAGATTTTTTTCTTTGATAAATTCTTGTTTTCATATCCTAAAATTAAAGGGAAAATAAACAATACATAACAAAACATATAAAAACCAACACAATAAAAAAAAGGATCAAATACTCATTAAAGTAACTTACACGGGGGCTAAAAACGGAGTTTGATGAATATTCACAAGATAAAAATCATATGTATGTTTCTGATATATCGATATACAATCAAACACTTTCAAGAATTTGTTTGTAGACTTTTTGCTAGAGACCTCATCAAAGTGCTACCAACTAAGATCAACTTATACTTCTTTTAGAGAAAATTTTTTTCAATGTACTCCAAAGAGATTTAGATCCTGTCTCTTCCTCTTCCTCTTCCTCGAAAGTCAAAGAAAAATCAGAGTCTCCCTGCTTATTCAGGCGGAGAGGCTCTAGGGTAGTTGCGTTTCTCTCATTGGGACACTGAACCTCATTTTCCAACATTTTGGTCATGTAAGAGGCGACAGGCTCATCGCAGGTGGGTGCATCAACATGGTAGTACCTGGACCAAGCGCTACATTGAGTCCCTCCTGGATAAACACCGCTACAATATTGTCTTTGGACGTTTGCCCAAACCATATCTTTTGAATACCAAAGCTGGACCACATTGTATGGCCTAATCATTGGTGCTACCATAATACTGGATTGGGAAACAGTCTGGTTAATTTTTTTCAACCAATTTTTCTTATCTAGCAATGATTTAATAAACCTGAAATCTAAATTGTCTTCGTTAGCGTCTGTGTCATAATCTACAATTGAGTACTGTGACGTCCAATTATATGGCACCGAGATGGGGAATCTGTCCGGTGTTTCGTCGCTGTTATCCTTCTCCTCCCTCCAAATGCAGTCAGAGGCAGGTGCCCATTCGGTTCGCCAGTCTCCGTTATTTACTACTTGGTACTGTTCCCAATCGTAATACGTTTCCTCTGGGTTGAAGATACTTGCTCTGCTCTTGACATTGCCCATAG
# CCACACCACGAGAAACATCGTGGAAGATTACGGAGCTGTTTACGATAGCAGGAGCAATGGATTTGACGAATGACACTTGATAAAAGTCTTTGGTCGAAAA'

# Keep the residues of the row identified as chrIII; if no such row is
# returned, chromosome stays an empty string.
chromosome = ""
for row in query.rows():
    if row["primaryIdentifier"] != "chrIII":
        continue
    chromosome = row["sequence.residues"]


def rev_comp(dna):
    """Return the reverse complement of ``dna``.

    Only the upper-case bases ``A``, ``T``, ``C`` and ``G`` are complemented;
    any other character is left out of the result.

    :param dna: sequence to reverse-complement
    :type dna: str
    :returns: complemented bases in reversed order
    :rtype: str
    """
    pairs = {"A": "T", "T": "A", "C": "G", "G": "C"}
    # Build the complement in one pass, then flip it to read the other strand.
    comp = "".join(pairs[nucl] for nucl in dna if nucl in pairs)
    return comp[::-1]
Пример #45
0
# This is an automatically generated script to run your query
# to use it you will require the intermine python client.
# To install the client, run the following command from a terminal:
#
#     sudo easy_install intermine
#
# For further documentation you can visit:
#     http://intermine.readthedocs.org/en/latest/web-services/

# The following two lines will be needed in every python script:
from intermine.webservice import Service
service = Service("https://yeastmine.yeastgenome.org:443/yeastmine/service",
                  token="YOUR-API-KEY")

# Get a new query on the class (table) you will be querying:
query = service.new_query("Gene")

# The view specifies the output columns
query.add_view("primaryIdentifier", "secondaryIdentifier",
               "organism.shortName", "symbol", "name")

# You can edit the constraint values below
query.add_constraint("Gene",
                     "IN",
                     "Gene list for S. cerevisiae 17 Dec 2018 11.4",
                     code="A")

# Uncomment and edit the code below to specify your own custom logic:
# query.set_logic("A")

# Emit one line per result row, listing the view columns in order.
for row in query.rows():
    print(row["primaryIdentifier"], row["secondaryIdentifier"],
          row["organism.shortName"], row["symbol"], row["name"])
Пример #46
0
def generate_graph():
    """Plot the queried genes along their chromosomes and save the figure.

    Runs a YeastMine ``Gene`` query (constraint logic ``A and (B or C) and D``),
    groups the returned rows by chromosome, draws one horizontal bar per
    chromosome scaled to its length, marks every gene at the midpoint of its
    start/end coordinates, and writes the result to
    ``static/rp_positions.svg``.
    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Gene")

    # Type constraints should come early - before all mentions of the paths they constrain
    query.add_constraint("goAnnotation.ontologyTerm", "GOTerm")

    # The view specifies the output columns
    query.add_view(
        "secondaryIdentifier",
        "symbol",
        "goAnnotation.ontologyTerm.identifier",
        "description",
        "chromosome.primaryIdentifier",
        "chromosomeLocation.start",
        "chromosomeLocation.end",
        "chromosome.length",
    )

    # This query's custom sort order is specified below:
    query.add_sort_order("Gene.symbol", "ASC")

    # You can edit the constraint values below
    query.add_constraint("goAnnotation.qualifier", "IS NULL", code="C")
    query.add_constraint("goAnnotation.qualifier", "!=", "NOT", code="B")
    query.add_constraint("goAnnotation.ontologyTerm.name", "=", "cytoplasmic translation", code="A")
    query.add_constraint(
        "name", "ONE OF", ["Ribosomal Protein of the Large subunit", "Ribosomal Protein of the Small subunit"], code="D"
    )

    # Your custom constraint logic is specified with the code below:
    query.set_logic("A and (B or C) and D")

    # chromosome id -> {"length": ..., "genes": [{"symbol", "start", "end"}, ...]}
    chromosome = {}
    for row in query.rows():
        chr_id = row["chromosome.primaryIdentifier"]
        if chr_id not in chromosome:
            chromosome[chr_id] = {"length": row["chromosome.length"], "genes": []}
        chromosome[chr_id]["genes"].append(
            {"symbol": row["symbol"], "start": row["chromosomeLocation.start"], "end": row["chromosomeLocation.end"]}
        )

    # Freeze the ids into a list so the bar order, tick labels and marker rows
    # all share one ordering (and so the labels are a real sequence).
    all_chr_ids = list(chromosome)
    all_chr_length = [chromosome[chr_id]["length"] for chr_id in all_chr_ids]
    y_pos = list(range(len(all_chr_ids)))

    fig = plt.figure()
    try:
        fig.set_size_inches(25, 10)
        # One horizontal bar per chromosome, as long as the chromosome.
        plt.barh(y_pos, all_chr_length, align="center", alpha=0.4)
        plt.yticks(y_pos, all_chr_ids)  # position and label for the y axis

        plt.xlabel("Gene Positions")
        plt.title("Ribosomal protein Genes in Yeast Genome")

        for chromosome_y_axis, chr_id in enumerate(all_chr_ids):
            for gene in chromosome[chr_id]["genes"]:
                posn = (gene["start"] + gene["end"]) / 2
                plt.plot(posn, chromosome_y_axis, "ro")
                plt.annotate(gene["symbol"], (posn - 2, chromosome_y_axis + 0.1), rotation="vertical")

        fig.savefig("static/rp_positions.svg")
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
# This is an automatically generated script to run your query
# to use it you will require the intermine python client.
# To install the client, run the following command from a terminal:
#
#     sudo easy_install intermine
#
# For further documentation you can visit:
#     http://intermine.readthedocs.org/en/latest/web-services/

# The following two lines will be needed in every python script:
from intermine.webservice import Service
service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

# Start a query rooted at the ARS class (table).
query = service.new_query("ARS")

# Output columns, in the order they are returned.
query.add_view(
    "chromosome.primaryIdentifier",
    "chromosomeLocation.start",
    "chromosomeLocation.end",
    "secondaryIdentifier",
)

# To sort the results, uncomment and edit the line below (the default):
# query.add_sort_order("ARS.chromosome.primaryIdentifier", "ASC")
'''
for row in query.rows():
    print row["chromosome.primaryIdentifier"], row["chromosomeLocation.start"], \
        row["chromosomeLocation.end"], row["secondaryIdentifier"]
'''
chromosome_list = []
Пример #48
0
#!/usr/bin/python

from intermine.webservice import Service

# List the symbol and name of every gene known to SynBioMine.
service = Service('http://synbiomine.org/query/service')
query = service.new_query()
query.add_view('Gene.symbol', 'Gene.name')
for row in query.results():
    # Function-call form of print works on Python 2 and Python 3 alike.
    print(row)
def get_chromosomal_coordinates_as_FASTA(chr_id, region_str, use_colon = False,
    extension_for_saving = extension_for_saving, return_text = False):
    '''
    Main function of script.
    Takes a chromosome designation and coordinates and gets from
    YeastMine the sequence of that region of the chromosome as FASTA format.
    Saves or returns the genomic sequence of the region in FASTA format.

    The coordinate order is used to signal which strand to get. Coordinates in
    ascending order for the Watson strand and descending order for the Crick
    strand as is the convention at https://www.yeastgenome.org/seqTools under
    'Search a specified chromosomal region of S288C genome'.

    Parameters:
    `chr_id`: chromosome designation without the leading "chr"; "chr" is
    prepended before querying YeastMine.
    `region_str`: the two coordinates joined by the delimiter, counted from 1.
    `use_colon`: split `region_str` on ":" instead of the script's default
    delimiter.
    `extension_for_saving`: passed to `generate_output_file_name` when the
    record is saved.
    `return_text`: use if calling from IPython or a Jupyter notebook and you
    want the FASTA record returned as text instead of saved to a file.

    Returns the FASTA text when `return_text` is set; otherwise writes the
    file and returns None.

    Raises AssertionError if both coordinates are the same, and IndexError if
    YeastMine returns no chromosome for `chr_id`.
    '''

    # Parse the region_str to get the start and end positions of the region.
    # Strand is provided via the order of the two coordinates.
    #---------------------------------------------------------------------------
    if use_colon:
        coordinates_delimiter = ":"
    else:
        coordinates_delimiter = coordinates_delimiter_default
    region_str_parts = region_str.split(coordinates_delimiter)
    start, end = int(region_str_parts[0]), int(region_str_parts[1])
    # Users thinking in Python terms may give zero for the first position;
    # coordinates here are counted from one and one is subtracted below, so
    # bump a zero up to one rather than ending up with a negative index.
    if start == 0:
        start = 1
    if end == 0:
        end = 1

    # sanity check; `start < end` cannot be required because a larger start
    # is how the Crick strand is requested
    assert start != end, (
    "The user-supplied 'start' ({}) and 'end' ({}) cannot be same value"
    ".".format(start,end))

    # Ascending coordinates mean Watson, anything else means Crick.
    get_crick_strand = not (start < end)
    if get_crick_strand:
        strand_text = "Crick(-1)"
        sys.stderr.write("Sequence on Crick strand specified...")
    else:
        strand_text = "Watson(1)"
        sys.stderr.write("Sequence on Watson strand specified...")

    # Get chromosome information from YeastMine
    #---------------------------------------------------------------------------
    # The whole chromosome sequence is fetched and then limited to the
    # coordinates needed.
    service = Service("https://yeastmine.yeastgenome.org:443/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Chromosome")

    # The view specifies the output columns
    query.add_view("sequence.residues")
    # constraint values
    chr_designation = "chr"+chr_id
    query.add_constraint("primaryIdentifier", "=", chr_designation, code = "A")

    # feedback (given before the fetch so it shows while waiting)
    sys.stderr.write("retrieving sequence from chromosome "
        "{}...".format(chr_id))

    results = list(query.rows())
    if not results:
        # Same exception type as indexing the empty result list, but with an
        # explanation of what went wrong.
        raise IndexError(
            "YeastMine returned no chromosome with identifier "
            "'{}'.".format(chr_designation))

    # store corresponding genomic sequence; the minus one is so the user can
    # provide coordinates in common terms while this adjusts for zero-indexing.
    low, high = min(start, end), max(start, end)
    genomic_seq = results[0]["sequence.residues"][low - 1: high]

    # format chr_info for making output file name or anything else needing
    # that information
    chr_info = {
        'chr_nom': chr_designation,
        'start': start,
        'end': end,
    }

    # Make output FASTA record
    #---------------------------------------------------------------------------
    # Description line loosely based on output from
    # https://www.yeastgenome.org/seqTools under 'Search a specified chromosomal
    # region of S288C genome'.
    record_description = 'coordinates {} to {}; strand is {}'.format(
        start, end, strand_text)
    record = SeqRecord(Seq(genomic_seq, generic_dna),
            id=chr_designation, description=record_description)
    # Make reverse complement if want crick strand, now that the sequence is
    # a biopython object with a `.reverse_complement` method
    if get_crick_strand:
        record = record.reverse_complement(id=True,description=True)
    sys.stderr.write("making FASTA formatted entry with retrieved sequence...")

    # Return text if called with `return_text = True`. Otherwise, consider
    # called from command line & save file.
    #---------------------------------------------------------------------------
    if return_text:
        # based on section 4.6 at
        #http://biopython.org/DIST/docs/tutorial/Tutorial.html#sec:SeqRecord-format
        sys.stderr.write("\nReturning genomic sequence in FASTA format.")
        return record.format("fasta")

    output_file_name = generate_output_file_name(
        chr_info,extension_for_saving)
    SeqIO.write(record,output_file_name, "fasta")
    # Feedback
    sys.stderr.write("\n\nFile of genomic sequence "
        "saved as '{}'.".format(output_file_name))
    sys.stderr.write("\nFinished.\n")
Пример #50
0
# Retrieve hierarchy from wormmine
# This version retrieves just a single child
# Requires intermine installed: $ easy_install intermine
# cf. http://intermine.wormbase.org/tools/wormmine/query.do for query construction
#-------------------------------------------------------------------------------
# USAGE: python RetrieveHierarchy.py > hierarchy.csv
#-------------------------------------------------------------------------------

# Get intermine service
from intermine.webservice import Service
service = Service("http://intermine.wormbase.org/tools/wormmine/service")

# Get a new query on the class (table) you will be querying:
query = service.new_query("AnatomyTerm")

# Specify the output columns:
query.add_view("name", "synonym", "primaryIdentifier", "children.name",
               "children.primaryIdentifier", "children.synonym")

# Specify a custom sort order?:
# query.add_sort_order("AnatomyTerm.name", "ASC")

#-------------------------------------------------------------------------------
# Just print names and IDs, one "parent|id|child|id" line per row.
# print() with a single pre-formatted string behaves the same on
# Python 2 and Python 3.
for row in query.rows():
    print('{0}|{1}|{2}|{3}'.format(row["name"], row["primaryIdentifier"],
                                   row["children.name"],
                                   row["children.primaryIdentifier"]))

# Names, synonyms, and IDs:
# for row in query.rows():
#     print row["name"],"(",row["synonym"],"): ",row["primaryIdentifier"],",",row["children.name"], \
# This is an automatically generated script to run your query
# to use it you will require the intermine python client.
# To install the client, run the following command from a terminal:
#
#     sudo easy_install intermine
#
# For further documentation you can visit:
#     http://intermine.readthedocs.org/en/latest/web-services/

# The following two lines will be needed in every python script:
from intermine.webservice import Service
service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

# Start a query rooted at the Protein class (table).
query = service.new_query("Protein")

# Output columns, in the order they are returned.
query.add_view(
    "genes.primaryIdentifier",
    "genes.secondaryIdentifier",
    "symbol",
    "length",
    "molecularWeight",
    "pI",
    "genes.featureType",
    "genes.sgdAlias",
    "genes.description",
    "sequence.residues",
)

# Constraints; each carries a letter code so it can be named in query logic.
query.add_constraint("genes.featureType", "=", "intein_encoding_region", code="H")
query.add_constraint("genes.featureType", "=", "blocked_reading_frame", code="E")
query.add_constraint("genes.qualifier", "!=", "Dubious", code="B")
query.add_constraint("genes.qualifier", "IS NULL", code="C")
query.add_constraint("genes.status", "=", "Active", code="D")
query.add_constraint("genes.featureType", "=", "ORF", code="F")
else:

    # To run your query
    # to use it you will require the intermine python client.
    # To install the client, run the following command from a terminal:
    #
    #     sudo easy_install intermine
    #
    # For further documentation you can visit:
    #     http://intermine.readthedocs.org/en/latest/web-services/

    # The following two lines will be needed in every python script:
    from intermine.webservice import Service
    service = Service("https://yeastmine.yeastgenome.org:443/yeastmine/service") #seems current as of January 2018 from the YeastMine site example, I had also prior to this change in the script, run on my machine ` sudo easy_install intermine --upgrade`
    query = service.new_query("SequenceFeature")
    query.add_view(
        "primaryIdentifier", "featureType", "secondaryIdentifier", "description",
        "sgdAlias", "symbol"
    )
    query.add_constraint("featureType", "=", "telomerase_RNA_gene", code = "Z")
    query.add_constraint("qualifier", "IS NULL", code = "W")
    query.add_constraint("qualifier", "!=", "Dubious", code = "V")
    query.add_constraint("status", "=", "Active", code = "U")
    query.add_constraint("featureType", "=", "transposable_element_gene", code = "S")
    query.add_constraint("featureType", "=", "telomeric_repeat", code = "R")
    query.add_constraint("featureType", "=", "telomere", code = "Q")
    query.add_constraint("featureType", "=", "tRNA_gene", code = "P")
    query.add_constraint("featureType", "=", "snoRNA_gene", code = "O")
    query.add_constraint("featureType", "=", "snRNA_gene", code = "N")
    query.add_constraint("featureType", "=", "LTR_retrotransposon", code = "M")
Пример #53
0
def fetch_yeast_locus_sequence(locus_name, flanking_size=0):
    """Acquire a sequence from SGD http://www.yeastgenome.org.

    With ``flanking_size == 0`` the locus sequence itself is returned. With a
    positive ``flanking_size`` the sequence comes from the locus's flanking
    region record whose direction is "both" and whose distance equals
    ``flanking_size`` expressed in kb. A negative ``flanking_size`` prints a
    warning and returns an empty sequence. Only the first result row is used.

    :param locus_name: Common name or systematic name for the locus (e.g. ACT1
                       or YFL039C).
    :type locus_name: str
    :param flanking_size: The length of flanking DNA (on each side) to return
    :type flanking_size: int
    :returns: The retrieved sequence wrapped in ``coral.DNA``.
    :raises StopIteration: If the query returns no rows.

    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Gene")

    if flanking_size > 0:

        # The view specifies the output columns
        # secondaryIdentifier: the systematic name (e.g. YFL039C)
        # symbol: short name (e.g. ACT1)
        # length: sequence length
        # flankingRegions.direction: Upstream or downstream (or both) of locus
        # flankingRegions.sequence.length: length of the flanking regions
        # flankingRegions.sequence.residues: sequence of the flanking regions
        query.add_view("secondaryIdentifier", "symbol", "length",
                       "flankingRegions.direction",
                       "flankingRegions.sequence.length",
                       "flankingRegions.sequence.residues")

        # You can edit the constraint values below
        query.add_constraint("flankingRegions.direction", "=", "both",
                             code="A")
        query.add_constraint("Gene", "LOOKUP", locus_name, "S. cerevisiae",
                             code="B")
        query.add_constraint("flankingRegions.distance", "=",
                             "{:.1f}kb".format(flanking_size / 1000.),
                             code="C")
        query.set_logic("A and B and C")

        # TODO: What to do when there's more than one result?
        # next(iter(...)) instead of the Python-2-only .next() method.
        first_result = next(iter(query.rows()))
        # FIXME: Use logger module to report the retrieved metadata

        seq = coral.DNA(first_result["flankingRegions.sequence.residues"])
        # TODO: add more metadata

    elif flanking_size == 0:
        # The view specifies the output columns
        query.add_view("primaryIdentifier", "secondaryIdentifier", "symbol",
                       "name", "sgdAlias", "organism.shortName",
                       "sequence.length", "sequence.residues", "description",
                       "qualifier")

        query.add_constraint("status", "IS NULL", code="D")
        query.add_constraint("status", "=", "Active", code="C")
        query.add_constraint("qualifier", "IS NULL", code="B")
        query.add_constraint("qualifier", "!=", "Dubious", code="A")
        query.add_constraint("Gene", "LOOKUP", locus_name, "S. cerevisiae",
                             code="E")

        # Your custom constraint logic is specified with the code below:
        query.set_logic("(A or B) and (C or D) and E")

        first_result = next(iter(query.rows()))
        seq = coral.DNA(first_result["sequence.residues"])
    else:
        # Negative flank sizes end up here.
        print("Problem with the flanking region size....")
        seq = coral.DNA("")

    return seq
def find_forkhead(chrom_ident, pattern):
    """Record matches of ``pattern`` that also contain an ACS-like motif.

    The sequence of chromosome ``chrom_ident`` is fetched from YeastMine and
    searched on both strands with ``pattern`` (``regex`` module syntax). Every
    hit that also matches the ACS expression, allowing at most one error, is
    kept as a potential origin and printed. Each potential origin and its
    reverse complement are then located by exact string match on the forward
    strand, and one line per location is appended to
    ``fkh_motifs_near_acs.csv``:
    ``Fkh pattern: <pattern>,<chrom>, <start>, <end>,<sequence>``.

    ``start`` is the 1-based position of the first matched base and ``end`` is
    ``start + len(sequence)``.

    :param chrom_ident: chromosome primary identifier, compared against the
        ``primaryIdentifier`` column of the query result
    :param pattern: regular expression describing the motif to look for
    :returns: None
    """
    import regex
    from intermine.webservice import Service

    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Chromosome")

    # The view specifies the output columns
    query.add_view("primaryIdentifier", "sequence.residues")

    # All chromosomes are returned; keep the one asked for ('' if absent).
    chromosome = ''
    for row in query.rows():
        if row["primaryIdentifier"] == chrom_ident:
            chromosome = row["sequence.residues"]

    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    def rev_comp(dna):
        """Reverse-complement ``dna``; other characters are reported and dropped."""
        comp = []
        for nucl in dna:
            if nucl in complement:
                comp.append(complement[nucl])
            else:
                print('Not a DNA sequence')
        comp.reverse()
        return ''.join(comp)

    chromosome_rev = rev_comp(chromosome)

    watson_finds = regex.findall(pattern, chromosome)
    crick_finds = regex.findall(pattern, chromosome_rev)

    # ACS expression with up to one error, compiled once for all candidates.
    acs = regex.compile('(AAC[TA]AAA[CT][GA]TAAA[AT][AT][GAT][GAT]){e<=1}')
    potential_origins = [
        str(hit) for hit in watson_finds + crick_finds
        if acs.search(str(hit)) is not None
    ]

    print(potential_origins)

    # (start, end, sequence) for every exact occurrence found below.
    motif_hits = []

    def find_motif(motif, seq):
        """Collect every (possibly overlapping) occurrence of ``motif`` in ``seq``."""
        width = len(motif)
        at = seq.find(motif)
        while 0 <= at < len(seq):
            # +1 because the first nucleotide is numbered 1, not 0
            motif_hits.append((at + 1, at + 1 + width, seq[at:at + width]))
            at = seq.find(motif, at + 1)

    for origin in potential_origins:
        find_motif(origin, chromosome)

    for origin in potential_origins:
        find_motif(rev_comp(origin), chromosome)

    with open('fkh_motifs_near_acs.csv', 'a') as f:
        for start_pos, end_pos, sequence in motif_hits:
            f.write("Fkh pattern: {},".format(pattern))
            f.write("{}, {}, {},".format(chrom_ident, start_pos, end_pos))
            f.write("{}\n".format(sequence))
Пример #55
0
from intermine.webservice import Service

from ete3 import NCBITaxa
ncbi = NCBITaxa()
#ncbi.update_taxonomy_database()

service = Service("https://phytozome.jgi.doe.gov/phytomine/service")
query = service.new_query("Organism")
query.add_view(
    "annotationVersion",
    "assemblyVersion",
    "commonName",
    "genus",
    "name",
    "proteomeId",
    "shortName",
    "species",
    "taxonId",
    "version",
)

# Organism columns, in the order they appear in the header line.
k = [
    "proteomeId",
    "commonName",
    "name",
    "shortName",
    "annotationVersion",
    "assemblyVersion",
    "genus",
    "species",
    "taxonId",
    "version",
]

# Taxonomic ranks that follow the organism columns in the header line.
t = [
    "superkingdom",
    "kingdom",
    "phylum",
    "class",
    "subclass",
    "order",
    "family",
    "genus",
    "species",
]

# Tab-separated header: organism columns, ranks, then the full lineage.
header = k + t + ["full_lineage"]
print("\t".join(header))


def filterRanks(L):
    """Return one name per rank in ``t`` for the taxids in ``L``.

    Each taxid in ``L`` is filed under the rank ``ncbi.get_rank`` reports for
    it (a later taxid with the same rank replaces an earlier one). The result
    lists, in the order of ``t``, the translated name of the taxid filed under
    each rank, or ``'NA'`` when no taxid has that rank.
    """
    rank_to_taxid = {}
    for taxid in L:
        rank = ncbi.get_rank([taxid])[taxid]
        rank_to_taxid[rank] = taxid

    names = []
    for rank in t:
        if rank not in rank_to_taxid:
            names.append('NA')
            continue
        translated = ncbi.get_taxid_translator([rank_to_taxid[rank]])
        names.append(list(translated.values())[0])
    return names

Пример #56
0
# This is an automatically generated script to run your query
# to use it you will require the intermine python client.
# To install the client, run the following command from a terminal:
#
#     sudo easy_install intermine
#
# For further documentation you can visit:
#     http://www.intermine.org/wiki/PythonClient

# The following two lines will be needed in every python script:
from intermine.webservice import Service
service = Service("http://www.mousemine.org/mousemine/service")

# Start a query rooted at the OntologyTerm class (table).
query = service.new_query("OntologyTerm")

# The type constraint is added first, ahead of every path that goes through
# ontologyAnnotations.subject.
query.add_constraint("ontologyAnnotations.subject", "Genotype")

# Output columns, in the order they are returned.
query.add_view(
    "identifier",
    "name",
    "namespace",
    "ontologyAnnotations.subject.primaryIdentifier",
    "ontologyAnnotations.subject.name",
    "ontologyAnnotations.qualifier",
    "ontologyAnnotations.evidence.code.code",
    "ontologyAnnotations.evidence.publications.mgiJnum",
)

# Uncomment and edit the line below (the default) to select a custom sort order:
# query.add_sort_order("OntologyTerm.identifier", "ASC")
Пример #57
0
from intermine.webservice import Service
service = Service(
    "http://yeastmine.yeastgenome.org/yeastmine/service",
    token="YOUR-API-KEY",
)
query = service.new_query("Gene")
query.add_view(
    "primaryIdentifier",
    "secondaryIdentifier",
    "organism.shortName",
    "symbol",
    "name",
)
# Restrict the results to the members of the named list.
query.add_constraint("Gene", "IN", "systematic gene names", code="A")

# Print the view columns of every row, in view order.
for row in query.rows():
    print(
        row["primaryIdentifier"],
        row["secondaryIdentifier"],
        row["organism.shortName"],
        row["symbol"],
        row["name"],
    )
def find_forkhead(chrom_ident, pattern):
    from intermine.webservice import Service
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Chromosome")

    # The view specifies the output columns
    query.add_view("primaryIdentifier", "sequence.residues")

    # Uncomment and edit the line below (the default) to select a custom sort order:
    # query.add_sort_order("Chromosome.primaryIdentifier", "ASC")
    #chrIII = 'GATTACAGGGAATTTGTTTAATAGCAATTTATACGCTTTGTTATCGGCACCACCAAATTCTGGGATAACCGTTAATTCTTCCTCAGGTTTGCCTAGTGGATCCTCTCCTTCTGGAGTTTGGCCACGCTCTGGCTTTTCGATCAGACTTGGCATGTGACTAATCAAGTATGGCATGCTGGTTTTTGGGTCCTTTGTTTTCGTTGTTTCAGTCTGGATAAATTTTAAGTTACCATTATCGAAGGCACTTTTGTACTTGTCACTAATTAAAGATGCAATGTCAGCGGGGATACTCATTTTTATTTTAATTTTTACTTTTCTGTTTGTTCTAAAATCTATCTAAACTGGCTTTCAAGATCAATCTATTGTCTTTTAAGGTAAACTTTAAATTGGAAATAATAGTAATGTTAGTTCCTTCATTTTAACCTTGTATTGTATTTCCTTTGCGTGATGAAAAAAAAACTGAAAAAGAGAAAAATAAGAAAATCTTCTAGAACGTTCCGAAACAGGACACTTAGCACACAAATACAGAATAGGAAAGTAAAAGGCAATATATGAATGCAGTGCTTGTAACTGGTGCTTGTATCCAAGAATAGCTTCTTGCTGTAGGTTATGGGAATATCGTGTAAGCTGGGGTGACTTTTGAGCTATTCGCGACGCCCGACGCCGTAATAACTACTTTCGACAGACCACTTATGACAGTATTTCAGGCCGCTCTTATAAAATGACATGTTAACAAACAGTTCTGATTATTCGCCTTTTGACAGGACGATAATGTAAATAGTTGTGGTAGTATCATTCAGGTATGTAACTGTTTACTTTGTATCGCTTGAAAAAAATAAGCATTTCAGAGCCTTCTTTGGAGCTCAAGTGGATTGAGGCCACAGCAAGACCGGCCAGTTTGAATGCTCAACTCTTCAAAAGAAATTCCTCAAATATGTCCAGTTTCATGTACTGTCCGGTGTGATTTATTATTTTTTATTTACTTTGTAGTTCTTAAAGCTAAGATTTTTTTCTTTGATAAATTCTTGTTTTCATATCCTAAAATTAAAGGGAAAATAAACAATACATAACAAAACATATAAAAACCAACACAATAAAAAAAAGGATCAAATACTCATTAAAGTAACTTACACGGGGGCTAAAAACGGAGTTTGATGAATATTCACAAGATAAAAATCATATGTATGTTTCTGATATATCGATATACAATCAAACACTTTCAAGAATTTGTTTGTAGACTTTTTGCTAGAGACCTCATCAAAGTGCTACCAACTAAGATCAACTTATACTTCTTTTAGAGAAAATTTTTTTCAATGTACTCCAAAGAGATTTAGATCCTGTCTCTTCCTCTTCCTCTTCCTCGAAAGTCAAAGAAAAATCAGAGTCTCCCTGCTTATTCAGGCGGAGAGGCTCTAGGGTAGTTGCGTTTCTCTCATTGGGACACTGAACCTCATTTTCCAACATTTTGGTCATGTAAGAGGCGACAGGCTCATCGCAGGTGGGTGCATCAACATGGTAGTACCTGGACCAAGCGCTACATTGAGTCCCTCCTGGATAAACACCGCTACAATATTGTCTTTGGACGTTTGCCCAAACCATATCTTTTGAATACCAAAGCTGGACCACATTGTATGGCCTAATCATTGGTGCTACCATAATACTGGATTGGGAAACAGTCTGGTTAATTTTTTTCAACCAATTTTTCTTATCTAGCAATGATTTAATAAACCTGAAATCTAAATTGTCTTCGTTAGCGTCTGTGTCATAATCTACAATTGAGTACTGTGACGTCCAATTATATGGCACCGAGATGGGGAATCTGTCCGGTGTTTCGTCGCTGTTATCCTTCTCCTCCCTCCAAATGCAGTCAGAGGCAGGTGCCCATTCGGTTCGCCAGTCTCCGTTATTTACTACTTGGTACTGTTCCCAATCGTAATACGTTTCCTCTGGGTTGAAGATACTTGCTCTGCTCTTGACATTGCCCA
    # TAGCCACACCACGAGAAACATCGTGGAAGATTACGGAGCTGTTTACGATAGCAGGAGCAATGGATTTGACGAATGACACTTGATAAAAGTCTTTGGTCGAAAA'

    chromosome = ''
    for row in query.rows():
        if row["primaryIdentifier"] == chrom_ident:
            chromosome = row["sequence.residues"]

    def rev_comp(dna):
        comp = ''
        for nucl in dna:
            if nucl == 'A':
                comp = comp + 'T'
            elif nucl == 'T':
                comp = comp + 'A'
            elif nucl == 'C':
                comp = comp + 'G'
            elif nucl == 'G':
                comp = comp + 'C'
            else:
                print 'Not a DNA sequence'

        rev_comp = comp[::-1]
        return rev_comp

    chromosome_rev = rev_comp(chromosome)



    #seq = 'AAACAGGACACTTAGCACACAAATACAGAATAGGAAAGTAAAAGGCAATATATGAATGCAGTGCTTGTAACTGGTGCTTGTATCCAAGAATAGCTTCTTGCTGTAGGTTATGGGAATATCGTGTAAGCTGGGGTGACTTTTGAGCTATTCGCGACGCCCGACGCCGTAATAACTACTTTCGACAGACCACTTATGACAGTATTTCAGGCCGCTCTTATAAAATGACATGTTAACAAACAGTTCTGATTATTCGCCTTTTGACAGGACGATAATGTAAATAGTTGTGGTAGTATCATTCAGGTATGTAACTGTTTACTTTGTATCGCTTGAAAAAAATAAGCATTTCAGAGCCTTCTTTGGAGCTCAAGTGGATTGAGGCCACAGCAAGACCGGCCAGTTTGAATGCTCAACTCTTCAAAAGAAATTCCTCAAATATGTCCAGTTTCATGTACTGTCCGGTGTGATTTATTATTTTTTATTTACTTTGTAGTTCTTAAAGCTAAGATTTTTTTCTTTGATAAATTCTTGTTTTCATATCCTAAAATTAAAGGGAAAATAAACAATACATAACAAAACATATAAAAACCAACACAATAAAAAAAAGGATCAAATACTCATTAAAGTAACTTACACGGGGGCTAAAAACGGAGTTTGATGAATATTCACAAGATAAAAATCATATGTATGTTTCTGATATATCGATATACAATCAAACACTTTCAAGAATTTGTTTGTAGACTTTTTGCTAGAGACCTCATCAAAGTGCTACCAACTAAGATCAACTTATACTTCTTTTAGAGAAAATTTTTTTCAATGTACTCCAAAGAGATTTAGATCCTGTCTCTTCCTCTTCCTCTTCCTCGAAAGTCAAAGAAAAATCAGAGTCTCCCTGCTTATTCAGGCGGAGAGGCTCTAGGGTAGTTGCGTTTCTCTCATTGGGACACTGAACCTCATTTTCCAACATTTTGGTCATGTAAGAGGCGACAGGCTCATCGCAGGTGGGTGCATCAACATGGTAGTACCTGGACCAAGCGCTACATTGAGTCCCTCCTGGATAAACACCGCTACAATATTGTCTTTGGACGTTT'
    #seq_rev = rev_comp(seq)
    watson_finds = []
    crick_finds = []

    def find_pattern(pattern,seq):
        import regex
        find = regex.findall(pattern, seq)
        return find



    watson_finds =  find_pattern(pattern, chromosome)
    crick_finds = find_pattern(pattern, chromosome_rev)
    #WWWWTTTAYRTTTWGTT
    #acs = '([ATC][ATC][AT][AT]TTTA[TC][AG]TTT[AT]GTT){e<=1}'
    acs = '(AAC[TA]AAA[CT][GA]TAAA[AT][AT][GAT][GAT]){e<=1}'
    #acs = 'TTATATGTTTT'
    #acs = 'AAAACATATAA'
    import regex
    potential_origins = []

    for a in watson_finds:
        filtered_watson = []
        a = str(a)
        filtered_watson = regex.findall(acs, a)
        if len(filtered_watson) > 0:
            potential_origins.append(str(a))

    for a in crick_finds:
        filtered_crick = []
        a = str(a)
        filtered_crick = regex.findall(acs, a)
        if len(filtered_crick) > 0:
            potential_origins.append(a)

    print potential_origins

    motif_start_pos = []
    motif_end_pos = []
    motif_seq = []
    def find_motif(motif,seq):
        for a in range(0, len(seq)):
            slice = seq[a:a+len(motif)]
            if slice == motif:
                motif_start_pos.append(a+1) #+1 because first nuc is 1 not 0
                motif_end_pos.append((a+1) + len(motif))
                motif_seq.append(seq[a:a+len(motif)])

    for a in potential_origins:
        find_motif(a,chromosome)

    for a in potential_origins:
        a = rev_comp(a)
        find_motif(a,chromosome)

    count = 0
    f1 = open('Out_table_1.txt', 'a')
    for a in range(0, len(motif_start_pos)):
        f1.write(chrom_ident)
        f1.write(' ')
        motif_midpoint = (motif_start_pos[a] + motif_end_pos[a]) / 2
        f1.write(str(motif_midpoint))
        f1.write(' ')
        f1.write(str(motif_seq[a]))
        f1.write('\n')