def sgd_connection(gene, p_dir, l_dir):
    """Fetch SGD (YeastMine) data for one gene and write it to disk.

    Three queries are run against the YeastMine service:

    1. phenotype annotations, written as a tab-separated table (with a
       header line) to ``<p_dir>/<gene>.txt``;
    2. phenotype summaries, concatenated into one string;
    3. PubMed ids of the gene's publication annotations; every non-empty
       id is passed to ``pubmed_connection`` and, when a truthy handle
       comes back, ``handle.read()`` is appended to ``<l_dir>/<gene>.txt``.

    :param gene: Value for the YeastMine ``LOOKUP`` constraint; also the
        stem of both output file names.
    :param p_dir: Directory that receives the phenotype table.
    :param l_dir: Directory that receives the literature file.
    :returns: ``[gene, summary]``.
    """
    service = Service(
        'https://yeastmine.yeastgenome.org:443/yeastmine/service')

    # ---- 1. phenotype table -------------------------------------------
    phenotype_query = service.new_query('Gene')
    columns = [
        'primaryIdentifier',
        'symbol',
        'secondaryIdentifier',
        'sgdAlias',
        'qualifier',
        'phenotypes.experimentType',
        'phenotypes.mutantType',
        'phenotypes.observable',
        'phenotypes.qualifier',
        'phenotypes.allele',
        'phenotypes.alleleComment',
        'phenotypes.strainBackground',
        'phenotypes.chemical',
        'phenotypes.condition',
        'phenotypes.details',
        'phenotypes.reporter',
        'phenotypes.publications.pubMedId',
        'phenotypes.publications.citation',
    ]
    for column in columns:
        phenotype_query.add_view(column)
    phenotype_query.add_constraint(
        'organism.shortName', '=', 'S. cerevisiae', code='B')
    phenotype_query.add_constraint('Gene', 'LOOKUP', gene, code='A')

    header = (
        'Gene Primary DBID\tGene Standard Name\tGene Systematic Name\t'
        'Gene Sgd Alias\tGene Qualifier\tPhenotypes Experiment Type\t'
        'Phenotypes Mutant Type\tPhenotypes Observable\tPhenotypes Qualifier\t'
        'Phenotypes Allele\tPhenotypes Allele Comment\tPhenotypes Strain Background\t'
        'Phenotypes Chemical\tPhenotypes Condition\tPhenotypes Details\t'
        'Phenotypes Reporter\tPublications PubMed ID\tPublications Citation\n'
    )
    phenotype_path = os.path.join(p_dir, '{0}.txt'.format(gene))
    with open(phenotype_path, 'w', encoding='utf-8') as phenotype_file:
        table_lines = [header]
        for row in phenotype_query.rows():
            fields = '\t'.join(str(row[column]) for column in columns)
            # strip() trims whitespace at both ends of the joined record.
            table_lines.append(fields.strip() + '\n')
        # The whole table is written in a single call once all rows are in.
        phenotype_file.write(''.join(table_lines))

    # ---- 2. phenotype summary -----------------------------------------
    summary_query = service.new_query('Gene')
    summary_query.add_view('phenotypes.genes.phenotypeSummary')
    summary_query.add_constraint(
        'organism.shortName', '=', 'S. cerevisiae', code='B')
    summary_query.add_constraint('Gene', 'LOOKUP', gene, code='A')
    summary_parts = []
    for row in summary_query.rows():
        text = row['phenotypes.genes.phenotypeSummary']
        if text:
            summary_parts.append(text)
    result_list = [gene, ''.join(summary_parts)]

    # ---- 3. literature (PubMed) ---------------------------------------
    pubmed_query = service.new_query('Gene')
    pubmed_query.add_view('publicationAnnotations.publication.pubMedId')
    pubmed_query.add_constraint(
        'organism.shortName', '=', 'S. cerevisiae', code='B')
    pubmed_query.add_constraint('Gene', 'LOOKUP', gene, code='A')
    literature_path = os.path.join(l_dir, '{0}.txt'.format(gene))
    with open(literature_path, 'w', encoding='utf-8') as literature_file:
        for row in pubmed_query.rows():
            pubmed_id = row['publicationAnnotations.publication.pubMedId']
            if not pubmed_id:
                continue
            handle = pubmed_connection(pubmed_id, gene)
            if handle:
                literature_file.write(handle.read())

    return result_list
def get_gene_id(gene_name):
    '''Retrieve systematic yeast gene name from the common name.

    When the lookup returns several rows, the identifier of the last row
    is returned.

    :param gene_name: Common name for yeast gene (e.g. ADE2).
    :type gene_name: str
    :returns: Systematic name for yeast gene (e.g. YOR128C), or None when
              the lookup returns no rows.
    :rtype: str or None

    '''
    from intermine.webservice import Service

    service = Service('http://yeastmine.yeastgenome.org/yeastmine/service')

    # Get a new query on the class (table) you will be querying:
    query = service.new_query('Gene')

    # The view specifies the output columns
    query.add_view('primaryIdentifier', 'secondaryIdentifier', 'symbol',
                   'name', 'sgdAlias', 'crossReferences.identifier',
                   'crossReferences.source.name')

    # Restrict to S. cerevisiae and resolve the name through LOOKUP.
    query.add_constraint('organism.shortName', '=', 'S. cerevisiae', code='B')
    query.add_constraint('Gene', 'LOOKUP', gene_name, code='A')

    # Start from None so that an empty result set yields None instead of an
    # UnboundLocalError on the return statement.
    gid = None
    for row in query.rows():
        gid = row['secondaryIdentifier']
    return gid
def get_gene_id(gene_name):
    """Retrieve systematic yeast gene name from the common name.

    When the lookup returns several rows, the identifier of the last row
    is returned.

    :param gene_name: Common name for yeast gene (e.g. ADE2).
    :type gene_name: str
    :returns: Systematic name for yeast gene (e.g. YOR128C), or None when
              the lookup returns no rows.
    :rtype: str or None

    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Gene")

    # The view specifies the output columns
    query.add_view("primaryIdentifier", "secondaryIdentifier", "symbol",
                   "name", "sgdAlias", "crossReferences.identifier",
                   "crossReferences.source.name")

    # Restrict to S. cerevisiae and resolve the name through LOOKUP.
    query.add_constraint("organism.shortName", "=", "S. cerevisiae", code="B")
    query.add_constraint("Gene", "LOOKUP", gene_name, code="A")

    # Start from None so that an empty result set yields None instead of an
    # UnboundLocalError on the return statement.
    gid = None
    for row in query.rows():
        gid = row["secondaryIdentifier"]
    return gid
def get_all_gene_annotations():
    """Download gene annotations from YeastMine into a DataFrame.

    One row is kept per distinct ``secondaryIdentifier`` (the first one
    seen). Several columns are renamed to short names (``chromosome``,
    ``ORF``, ``Gene``, ``start``, ``end``, ``orf_strand``) and a
    ``Gene_ORF`` column is added by applying ``gene_orfer`` to each row.

    :returns: The assembled ``pandas.DataFrame``.
    """
    service = Service(
        "https://yeastmine.yeastgenome.org:443/yeastmine/service")
    query = service.new_query("Gene")
    col_names = [
        "briefDescription",
        "description",
        "functionSummary",
        "chromosome.primaryIdentifier",
        "secondaryIdentifier",
        "symbol",
        "phenotypeSummary",
        "locations.strand",
        "locations.end",
        "locations.start",
    ]
    query.add_view(col_names)

    columns = {name: [] for name in col_names}
    seen_orfs = set()
    for record in query.rows():
        orf = record['secondaryIdentifier']
        # Rows are repeated in the yeastmine output; keep only the first
        # occurrence of each ORF.
        if orf in seen_orfs:
            continue
        for name in col_names:
            columns[name].append(record[name])
        seen_orfs.add(orf)

    name_shortener = {
        'chromosome.primaryIdentifier': 'chromosome',
        'secondaryIdentifier': 'ORF',
        'symbol': 'Gene',
        'locations.start': 'start',
        'locations.end': 'end',
        'locations.strand': 'orf_strand',
    }
    frame = pd.DataFrame(columns)
    frame = frame.rename(columns=name_shortener)
    frame['Gene_ORF'] = frame.apply(gene_orfer, axis=1)
    return frame
def main():
    """Connects to yeastmine and creates a dictionary of annotation data.

    Data is saved into shelve as well as returned. Each ORF row becomes a
    dict keyed by the last path component of every view (e.g.
    ``SequenceFeature.locations.start`` -> ``start``); the dict is stored
    under the row's ``secondaryIdentifier`` both in the returned mapping
    and in the shelve file.

    :returns: dict mapping secondaryIdentifier -> attribute dict.
    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine")
    query = service.new_query()
    query.add_view(
        "SequenceFeature.primaryIdentifier", "SequenceFeature.featureType",
        "SequenceFeature.secondaryIdentifier", "SequenceFeature.description",
        "SequenceFeature.sgdAlias", "SequenceFeature.name",
        "SequenceFeature.symbol", "SequenceFeature.chromosome.name",
        "SequenceFeature.chromosome.featAttribute",
        "SequenceFeature.locations.start", "SequenceFeature.locations.end",
        "SequenceFeature.locations.strand"
    )
    query.add_constraint("SequenceFeature.organism.name", "=",
                         "Saccharomyces cerevisiae", "A")
    query.add_constraint("SequenceFeature.featureType", "=", "ORF", "B")
    query.set_logic("(A and B)")

    annotation = {}
    db = shelve.open(
        os.path.join(settings.PROJECT_ROOT, 'apps', 'annotations', 'SGD',
                     'yeastmine'),
        'c')
    # try/finally so the shelve file is closed even if the query or a row
    # fails part-way through.
    try:
        for row in query.rows():
            data = {}
            # enumerate() replaces the Python 2-only xrange() index loop.
            for index, view in enumerate(row.views):
                attribute = view.split('.')[-1]
                value = row.data[index]['value']
                # NOTE(review): both "SequenceFeature.name" and
                # "SequenceFeature.chromosome.name" reduce to 'name', so a
                # non-empty chromosome name overwrites the feature name.
                # Left unchanged because readers of the shelve may rely on
                # it - confirm the intent.
                if attribute == 'name' and not value:
                    continue
                data[attribute] = value
            if 'name' not in data:
                data['name'] = None
            annotation[data['secondaryIdentifier']] = data
            db[str(data['secondaryIdentifier'])] = data
    finally:
        db.close()
    return annotation
def intermine_query(ids, organism, *args):
    """Build (but do not run) a case-sensitive Gene query for an organism.

    :param ids: Value handed to the ``LOOKUP`` constraint on ``Gene``.
    :param organism: Organism name; used both as the key into
        ``service_urls`` and as the ``organism.name`` constraint value.
    :param args: Output columns, forwarded to ``query.select``.
    :returns: The configured query object.
    """
    mine = Service(service_urls[organism])
    gene_query = mine.new_query("Gene", case_sensitive=True)
    gene_query.add_constraint("Gene", "LOOKUP", ids, code="A")
    gene_query.add_constraint("organism.name", "=", organism, code="B")
    gene_query.select(*args)
    return gene_query
def getInteractions():
    """Collect interactions for selected ribosomal-protein genes.

    Queries YeastMine for genes annotated with the GO term
    "cytoplasmic translation" whose name is one of the two ribosomal
    protein names listed below, restricted to manually curated
    interactions.

    :returns: dict mapping gene symbol -> list of dicts with the keys
        ``expt`` (experiment type), ``gene2`` (partner symbol) and
        ``desc`` (partner brief description).
    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")
    query = service.new_query("Gene")

    # Type constraints come before any mention of the paths they constrain.
    query.add_constraint("goAnnotation.ontologyTerm", "GOTerm")

    # Output columns.
    query.add_view(
        "symbol",
        "interactions.details.experimentType",
        "interactions.gene2.symbol",
        "interactions.gene2.briefDescription"
    )

    # Value constraints; they are combined by set_logic() below.
    query.add_constraint("goAnnotation.qualifier", "IS NULL", code="C")
    query.add_constraint("goAnnotation.qualifier", "!=", "NOT", code="B")
    query.add_constraint("goAnnotation.ontologyTerm.name", "=",
                         "cytoplasmic translation", code="A")
    query.add_constraint(
        "name", "ONE OF",
        ["Ribosomal Protein of the Large subunit",
         "Ribosomal Protein of the Small subunit"],
        code="D")
    query.add_constraint("interactions.details.annotationType", "=",
                         "manually curated", code="E")
    query.set_logic("A and (B or C) and E and D")

    by_symbol = {}
    for hit in query.rows():
        partner = {
            "expt": hit["interactions.details.experimentType"],
            "gene2": hit["interactions.gene2.symbol"],
            "desc": hit["interactions.gene2.briefDescription"],
        }
        by_symbol.setdefault(hit["symbol"], []).append(partner)
    return by_symbol
def wmquery():
    """Run a Gene query against WormMine and print every returned row.

    The query has no constraints; each row is printed with one ``print``
    call that lists the requested columns in view order.
    """
    service = Service("http://intermine.wormbase.org/tools/wormmine/service")
    query = service.new_query("Gene")
    view_paths = (
        "biotype",
        "length",
        "symbol",
        "primaryIdentifier",
        "downstreamIntergenicRegion.primaryIdentifier",
        "downstreamIntergenicRegion.organism.name",
        "downstreamIntergenicRegion.locations.feature.primaryIdentifier",
        "downstreamIntergenicRegion.locations.start",
        "downstreamIntergenicRegion.locations.end",
        "downstreamIntergenicRegion.locations.strand",
        "homologues.dataSets.name",
        "upstreamIntergenicRegion.primaryIdentifier",
        "upstreamIntergenicRegion.organism.name",
        "upstreamIntergenicRegion.locations.feature.primaryIdentifier",
        "upstreamIntergenicRegion.locations.start",
        "upstreamIntergenicRegion.locations.end",
        "upstreamIntergenicRegion.locations.strand",
        "transcripts.primaryIdentifier",
        "transcripts.symbol",
    )
    query.add_view(*view_paths)

    for record in query.rows():
        # The arguments stay spelled out so the call prints the same thing
        # whether it is parsed as a print statement or a print function.
        print (
            record["biotype"],
            record["length"],
            record["symbol"],
            record["primaryIdentifier"],
            record["downstreamIntergenicRegion.primaryIdentifier"],
            record["downstreamIntergenicRegion.organism.name"],
            record["downstreamIntergenicRegion.locations.feature.primaryIdentifier"],
            record["downstreamIntergenicRegion.locations.start"],
            record["downstreamIntergenicRegion.locations.end"],
            record["downstreamIntergenicRegion.locations.strand"],
            record["homologues.dataSets.name"],
            record["upstreamIntergenicRegion.primaryIdentifier"],
            record["upstreamIntergenicRegion.organism.name"],
            record["upstreamIntergenicRegion.locations.feature.primaryIdentifier"],
            record["upstreamIntergenicRegion.locations.start"],
            record["upstreamIntergenicRegion.locations.end"],
            record["upstreamIntergenicRegion.locations.strand"],
            record["transcripts.primaryIdentifier"],
            record["transcripts.symbol"])
def index_genes(organism, mod):
    """Fetch the genes of one organism's mine and bulk-index them.

    Gene data is cached per day in ``<organism>mine_genes_<MM_DD_YYYY>.bkp``
    in the current directory. If today's cache file exists it is loaded;
    otherwise the mine is queried, rows are grouped per gene id (GO ids and
    GO names are accumulated in lists) and the result is pickled to the
    cache file. The genes are then sent to Elasticsearch through
    ``es.bulk`` in batches.

    :param organism: Organism name; used in the cache file name, progress
        messages, document ids and the ``organism`` field.
    :param mod: Mapping with the keys ``mine_service_url``,
        ``mine_organism_name``, ``gene_fields`` (itself a mapping with
        ``id``, ``gene_name``, ``gene_symbol``, ``gene_synonym``,
        ``go_id`` and ``go_name``), ``url_prefix`` and ``url_suffix``.
    """
    backup_filename = (organism + "mine_genes_" +
                       time.strftime("%m_%d_%Y") + ".bkp")

    if os.path.isfile(backup_filename):
        print("Restoring fetched data from today from " + organism + "mine")
        # NOTE: pickle.load executes arbitrary code from the file; this is
        # only acceptable because the cache is written by this function.
        # The with-block closes the handle, which was previously leaked.
        with open(backup_filename, 'rb') as backup:
            genes = pickle.load(backup)
    else:
        print("Fetching data from " + organism + "mine")
        fields = mod["gene_fields"]

        service = Service(mod["mine_service_url"])
        query = service.new_query("Gene")
        # list() keeps this a plain list on Python 3 as well, where
        # dict.values() is a view object.
        query.add_view(list(fields.values()))
        query.add_constraint("organism.name", "=",
                             mod["mine_organism_name"], code="B")

        genes = {}
        for row in query.rows():
            gene_id = row[fields["id"]]
            if gene_id in genes:
                # Repeated rows for a gene only contribute more GO terms.
                genes[gene_id]["go_ids"].append(row[fields["go_id"]])
                genes[gene_id]["go_names"].append(row[fields["go_name"]])
            else:
                genes[gene_id] = {
                    "name": row[fields["gene_name"]],
                    "symbol": row[fields["gene_symbol"]],
                    "synonym": row[fields["gene_synonym"]],
                    "go_ids": [row[fields["go_id"]]],
                    "go_names": [row[fields["go_name"]]],
                    "href": (mod["url_prefix"] + row["primaryIdentifier"] +
                             mod["url_suffix"]),
                    "organism": organism,
                    "category": "gene"
                }

        with open(backup_filename, 'wb') as backup:
            pickle.dump(genes, backup)

    print("Indexing " + str(len(genes)) + " " + organism + " genes")

    bulk_data = []
    for gene_id, gene in genes.items():
        # Each document takes two entries: the action line and the source.
        bulk_data.append({
            'index': {
                '_index': INDEX_NAME,
                '_type': DOC_TYPE,
                '_id': organism + "_" + gene_id
            }
        })
        bulk_data.append(gene)

        # Flush once 500 entries (250 documents) have been collected.
        if len(bulk_data) % 500 == 0:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if bulk_data:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
def search_SGD(self, gene_code=None):
    """Look up a yeast gene by symbol in YeastMine and return its location.

    The first row returned by the query is printed (with the full
    chromosome identifier) and returned (with the first three characters
    of the chromosome identifier removed).

    :param gene_code: Gene symbol matched with ``symbol = gene_code``.
    :returns: ``[secondaryIdentifier, chromosome, start, end, strand]``
        where strand is ``"+"`` or ``"-"``, or None when the query
        returns no rows.
    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")
    query = service.new_query("Gene")
    query.add_view(
        "chromosome.primaryIdentifier",
        "chromosomeLocation.end",
        "chromosomeLocation.start",
        "chromosomeLocation.strand",
        "secondaryIdentifier",
    )
    query.add_constraint("symbol", "=", gene_code, code="A")

    for row in query.rows():
        strand = row["chromosomeLocation.strand"]
        # A plain truthiness test reports every non-empty strand as "+",
        # because values such as "1" and "-1" (or 1 and -1) are all truthy.
        # Look at the sign instead; an empty/missing strand still maps to
        # "-" as before.
        if strand and not str(strand).strip().startswith("-"):
            strand_sign = "+"
        else:
            strand_sign = "-"

        print(
            [
                row["secondaryIdentifier"],
                row["chromosome.primaryIdentifier"],
                row["chromosomeLocation.start"],
                row["chromosomeLocation.end"],
                strand_sign,
            ]
        )
        # Only the first row is used.
        return [
            row["secondaryIdentifier"],
            row["chromosome.primaryIdentifier"][3:],
            row["chromosomeLocation.start"],
            row["chromosomeLocation.end"],
            strand_sign,
        ]
    return None
def fetch_from_sgd() -> dict:
    """Query SGD's intermine service for S. cerevisiae ORF features.

    The result maps each "SGD_ID" to a dict of feature data with the keys
    sgd_id, feature_qualifier, feature_type, orf, name, aliases,
    chromosome, chromosomal_location, start_coordinate, stop_coordinate
    and description.

    ``chromosome`` and ``chromosomal_location`` are derived from the ORF
    name: the chromosome is 0 for names starting with 'Q', otherwise the
    second character's ordinal minus 64; the location is the first run of
    digits in the name (0 if there is none), negated when the third
    character is 'L'.

    :rtype: dict
    """
    digits = re.compile(r'(\d+)')

    service = Service("https://yeastmine.yeastgenome.org/yeastmine/service")
    query = service.new_query("Gene")
    query.add_view(
        "primaryIdentifier", "featureType", "qualifier",
        "secondaryIdentifier", "symbol",
        "chromosomeLocation.start", "chromosomeLocation.end",
        "description", "synonyms.value",
    )
    query.add_constraint("organism.shortName", "=", "S. cerevisiae", code="A")
    query.add_constraint("featureType", "=", "ORF", code="C")

    features = {}
    logger.debug("Executing query on yeastmine")

    for record in query.rows():
        sgd_id = record["primaryIdentifier"]
        orf = record["secondaryIdentifier"]

        found = digits.search(orf)
        location = int(found.group(1)) if found else 0

        if orf.startswith('Q'):
            chromosome = 0
        else:
            chromosome = ord(orf[1]) - 64
            if orf[2] == 'L':
                location = -location

        # The same gene comes back once per synonym; create it only once.
        if sgd_id not in features:
            logger.debug(f"Parsing new ORF: {orf}")
            features[sgd_id] = {
                'sgd_id': record["primaryIdentifier"],
                'feature_qualifier': record["qualifier"],
                'feature_type': record['featureType'],
                'orf': orf,
                'name': record["symbol"],
                'aliases': [],
                'chromosome': chromosome,
                'chromosomal_location': location,
                'start_coordinate': str(record["chromosomeLocation.start"]),
                'stop_coordinate': str(record["chromosomeLocation.end"]),
                'description': record["description"],
            }

        # A synonym equal to the ORF name or the symbol is not an alias.
        synonym = record["synonyms.value"]
        if synonym not in (orf, record["symbol"]):
            features[sgd_id]['aliases'].append(synonym)

    return features
def query_fishmine(intermine_url: str, protein_id: str, query: str="Gene") -> IntermineResult:
    """Find primary identifiers cross-referenced to a protein id.

    :param intermine_url: URL of the InterMine service to query.
    :param protein_id: Identifier matched against
        ``crossReferences.identifier``.
    :param query: Name of the class the query is rooted at.
    :return: Result of ``intermine_response_factory`` for the list of
        matching identifiers (each prefixed with ``ZFIN:``) and
        ``protein_id``.
    """
    mine = Service(intermine_url)
    mine_query = mine.new_query(query)
    mine_query.add_view("primaryIdentifier")
    mine_query.add_constraint("primaryIdentifier", "CONTAINS", "ZDB*", code="A")
    mine_query.add_constraint(
        "crossReferences.identifier", "=", "{}".format(protein_id), code="B")

    zfin_ids = []
    for hit in mine_query.rows():
        zfin_ids.append("ZFIN:{}".format(hit['primaryIdentifier']))
    return intermine_response_factory(zfin_ids, protein_id)
def parse(self, limit=None):
    """Load gene-to-phenotype associations from MouseMine into the graph.

    For every two-digit prefix ``MGI:10`` .. ``MGI:99`` an
    OntologyAnnotation query (subject: SequenceFeature, term: MPTerm,
    taxon 10090) is run. Each row becomes a ``G2PAssoc`` added to
    ``self.graph``; when the row has a PubMed id, a journal-article
    ``Reference`` is added to the graph and attached as the source.

    :param limit: If truthy, stop after this many id prefixes have been
        processed.
    :return: None
    """
    # The service object and the logger levels do not change between
    # prefixes, so set them up once instead of once per iteration.
    service = Service("http://www.mousemine.org/mousemine/service")
    logging.getLogger('Model').setLevel(logging.CRITICAL)
    logging.getLogger('JSONIterator').setLevel(logging.CRITICAL)

    count = 0
    for num in range(10, 100):
        fuzzy_gene = "MGI:{0}*".format(num)
        gene = "MGI:{0}".format(num)

        query = service.new_query("OntologyAnnotation")
        # Type constraints come before the paths they constrain are used.
        query.add_constraint("subject", "SequenceFeature")
        query.add_constraint("ontologyTerm", "MPTerm")
        query.add_view("subject.primaryIdentifier", "subject.symbol",
                       "subject.sequenceOntologyTerm.name",
                       "ontologyTerm.identifier", "ontologyTerm.name",
                       "evidence.publications.pubMedId",
                       "evidence.comments.type",
                       "evidence.comments.description")
        query.add_sort_order("OntologyAnnotation.ontologyTerm.name", "ASC")
        query.add_constraint("subject.organism.taxonId", "=", "10090",
                             code="A")
        query.add_constraint("subject", "LOOKUP", fuzzy_gene, code="B")
        query.add_constraint("subject.primaryIdentifier", "CONTAINS", gene,
                             code="C")
        query.outerjoin("evidence.comments")

        for row in query.rows():
            mgi_curie = row["subject.primaryIdentifier"]
            mp_curie = row["ontologyTerm.identifier"]
            pub_curie = "PMID:{0}".format(
                row["evidence.publications.pubMedId"])
            assoc = G2PAssoc(self.graph, self.name, mgi_curie, mp_curie)
            # Only rows that carry a PubMed id get a reference and source.
            if row["evidence.publications.pubMedId"]:
                reference = Reference(
                    self.graph, pub_curie,
                    Reference.ref_types['journal_article'])
                reference.addRefToGraph()
                assoc.add_source(pub_curie)

            assoc.add_evidence('ECO:0000059')
            assoc.add_association_to_graph()

        # Progress message after every tenth prefix.
        if not count % 10 and count != 0:
            count_from = count - 10
            logger.info(
                "{0} processed ids from MGI:{1}* to MGI:{2}*".format(
                    datetime.datetime.now(), count_from, count))

        count += 1
        if limit and count >= limit:
            break

    return
def get_yeast_gene_location(gene_name):
    '''Acquire the location of a gene from SGD http://www.yeastgenome.org

    Only the first row returned by the lookup is used.

    :param gene_name: Name of the gene.
    :type gene_name: string
    :returns location: [int: chromosome, int:biostart, int:bioend, int:strand]
    :rtype location: list
    :raises StopIteration: if the lookup returns no rows.
    :raises KeyError: if the chromosome identifier is not one of
                      chrI .. chrXVI.

    '''
    from intermine.webservice import Service

    service = Service('http://yeastmine.yeastgenome.org/yeastmine/service')

    # Get a new query on the class (table) you will be querying:
    query = service.new_query('Gene')

    # The view specifies the output columns
    query.add_view('primaryIdentifier', 'secondaryIdentifier', 'symbol',
                   'name', 'organism.shortName',
                   'chromosome.primaryIdentifier',
                   'chromosomeLocation.start', 'chromosomeLocation.end',
                   'chromosomeLocation.strand')

    # Restrict to S. cerevisiae and resolve the name through LOOKUP.
    query.add_constraint('organism.shortName', '=', 'S. cerevisiae', code='B')
    query.add_constraint('Gene', 'LOOKUP', gene_name, code='A')

    # Roman-numeral chromosome identifiers -> chromosome numbers.
    chromosomes = {
        'chrI': 1,
        'chrII': 2,
        'chrIII': 3,
        'chrIV': 4,
        'chrV': 5,
        'chrVI': 6,
        'chrVII': 7,
        'chrVIII': 8,
        'chrIX': 9,
        'chrX': 10,
        'chrXI': 11,
        'chrXII': 12,
        'chrXIII': 13,
        'chrXIV': 14,
        'chrXV': 15,
        'chrXVI': 16
    }

    # Builtin next()/iter() instead of the Python 2-only .next() method.
    first_result = next(iter(query.rows()))

    return [
        chromosomes[first_result['chromosome.primaryIdentifier']],
        first_result['chromosomeLocation.start'],
        first_result['chromosomeLocation.end'],
        int(first_result['chromosomeLocation.strand'])
    ]
def download(self, genes, fields, scope=None, species=None):
    '''
    Retrieve the data selected by self.constraints and self.views.

    The genes are looked up in batches of at most 1000. For every
    returned row a tuple ``(symbol, <value of each view>...)`` is
    collected, and the full list is stored in ``self.dataset``.

    :param genes: Sequence of gene identifiers to look up.
    :param fields: Not used by this method.
    :param scope: Not used by this method.
    :param species: Not used by this method.
    '''
    constraints = self.constraints
    views = self.views
    glist = np.array(genes)

    if len(glist) > 1000:
        # Ceiling division: the previous `len / 1000` was truncated by
        # array_split (e.g. 1500 genes -> 1 segment), letting a segment
        # exceed 1000 genes.
        n_segments = (len(glist) + 999) // 1000
        segs = np.array_split(glist, n_segments)
    else:
        segs = [glist]

    # store the data in here
    z = []
    # API uses letters to distinguish between constraints
    alpha = list(string.ascii_uppercase)

    for seg in segs:
        # Connect to the API
        service = SS(self.datasource)
        query = service.new_query("Gene")
        query.add_view(",".join(views))

        # Some databases require a host name
        if self.hostid != "":
            query.add_constraint("Gene", "LOOKUP", ",".join(seg),
                                 self.hostid, code="A")
        else:
            query.add_constraint("Gene", "LOOKUP", ",".join(seg), code="A")

        # Apply the constraints; codes start at "B" because "A" is taken
        # by the LOOKUP constraint above.
        for i, constraint in enumerate(constraints, 1):
            letter = alpha[i]
            parts = constraint.split("=")
            if len(parts) == 2:
                query.add_constraint(parts[0], "=", parts[1], code=letter)
            elif re.search("IS NOT NULL", constraint):
                p1 = constraint.replace(" IS NOT NULL", "")
                query.add_constraint(p1, "IS NOT NULL", code=letter)

        # Parse the output into a list of tuples
        for row in query.rows():
            z.append(tuple([row['symbol']] + [row[v] for v in views]))

    self.dataset = z
def find_max_data_items(new_list, intermine, intermine_url):
    """Return the largest row count among the given classes of a mine.

    For each class name a query selecting ``<class>.*`` is built and its
    ``count()`` is taken.

    :param new_list: Iterable of class names to count.
    :param intermine: Not used by this function.
    :param intermine_url: Base URL of the mine; ``/service`` is appended.
    :returns: The highest count seen, or 0 when ``new_list`` is empty.
    """
    service = Service(intermine_url + "/service")
    largest = 0
    for class_name in new_list:
        query = service.new_query(class_name)
        query.add_view(class_name + ".*")
        # count() is a request to the service, so issue it only once per
        # class instead of once for the test and once for the assignment.
        item_count = query.count()
        if item_count >= largest:
            largest = item_count
    return largest
def query(ids):
    """Query TargetMine for proteins and their associated compounds.

    :param ids: Non-empty sequence of identifier strings; they are joined
        with commas into the value of an ``IN`` constraint on ``Protein``.
    :returns: The object returned by the query's ``rows()``.
    :raises IndexError: if ``ids`` is an empty sequence.
    """
    targetmine = Service("http://targetmine.nibio.go.jp/targetmine")
    protein_query = targetmine.new_query("Protein")
    protein_query.add_view(
        "primaryIdentifier",
        "primaryAccession",
        "name",
        "length",
        "compounds.compound.casRegistryNumber",
        "compounds.compound.name",
        "compounds.compound.compoundGroup.name",
    )
    # The value is not used afterwards; the access is kept because it
    # makes an empty ``ids`` fail here.
    _first_id = ids[0]
    joined_ids = ",".join(ids)
    protein_query.add_constraint("Protein", "IN", joined_ids)
    return protein_query.rows()
def main():
    """Build a gene x experiment expression matrix from ThaleMine.

    Gene identifiers are read, one per line, from the file named by
    ``sys.argv[1]`` (relative to the current directory). For each gene
    the RNA-seq expression levels are queried, and a tab-separated matrix
    is written to ``results/all_genes.csv``: the header lists every
    "<SRA accession>-<tissue>" label in order of first appearance, and
    each following line holds a gene name and its expression value per
    label ("0" when the gene has none for that label).
    """
    if not os.path.exists("results"):
        os.makedirs("results")
    service = Service("https://apps.araport.org/thalemine/service")

    experiments = []         # column labels, in order of first appearance
    seen_experiments = set()
    expression = {}          # (gene, label) -> first expression value seen
    gene_names = []          # one entry per input line, in input order

    # os.path.join also accepts an absolute path in argv[1]; the with-block
    # closes the input file, which was previously left open.
    with open(os.path.join(os.getcwd(), sys.argv[1])) as gene_file:
        for line in gene_file:
            gene = line.strip()
            query = service.new_query("Gene")
            query.add_view("primaryIdentifier",
                           "RNASeqExpressions.expressionLevel",
                           "RNASeqExpressions.experiment.SRAaccession",
                           "RNASeqExpressions.experiment.tissue",
                           "RNASeqExpressions.unit")
            query.add_sort_order(
                "Gene.RNASeqExpressions.experiment.SRAaccession", "DESC")
            query.add_constraint("primaryIdentifier", "=", gene, code="A")

            for row in query.rows():
                experiment_tissue = str(
                    row["RNASeqExpressions.experiment.SRAaccession"]
                ) + "-" + str(row["RNASeqExpressions.experiment.tissue"])
                expression_value = str(
                    row["RNASeqExpressions.expressionLevel"])
                if experiment_tissue not in seen_experiments:
                    seen_experiments.add(experiment_tissue)
                    experiments.append(experiment_tissue)
                # setdefault keeps the first value per (gene, label), the
                # same value the former linear scan stopped at.
                expression.setdefault((gene, experiment_tissue),
                                      expression_value)
            gene_names.append(gene)

    # The output is opened only once all data is collected, and the
    # with-block closes it even if a write fails.
    with open("results/all_genes.csv", "w") as out:
        out.write("".join("\t" + label for label in experiments))
        out.write("\n")
        for gene_name in gene_names:
            out.write(gene_name)
            for label in experiments:
                out.write("\t" + expression.get((gene_name, label), "0"))
            out.write("\n")
def get_yeast_gene_location(gene_name):
    """Acquire the location of a gene from SGD http://www.yeastgenome.org

    Only the first row returned by the lookup is used.

    :param gene_name: Name of the gene.
    :type gene_name: string
    :returns location: [int: chromosome, int:biostart, int:bioend, int:strand]
    :rtype location: list
    :raises StopIteration: if the lookup returns no rows.
    :raises KeyError: if the chromosome identifier is not one of
                      chrI .. chrXVI.

    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Gene")

    # The view specifies the output columns
    query.add_view("primaryIdentifier", "secondaryIdentifier", "symbol",
                   "name", "organism.shortName",
                   "chromosome.primaryIdentifier",
                   "chromosomeLocation.start", "chromosomeLocation.end",
                   "chromosomeLocation.strand")

    # Restrict to S. cerevisiae and resolve the name through LOOKUP.
    query.add_constraint("organism.shortName", "=", "S. cerevisiae", code="B")
    query.add_constraint("Gene", "LOOKUP", gene_name, code="A")

    # Roman-numeral chromosome identifiers -> chromosome numbers.
    chromosomes = {
        "chrI": 1,
        "chrII": 2,
        "chrIII": 3,
        "chrIV": 4,
        "chrV": 5,
        "chrVI": 6,
        "chrVII": 7,
        "chrVIII": 8,
        "chrIX": 9,
        "chrX": 10,
        "chrXI": 11,
        "chrXII": 12,
        "chrXIII": 13,
        "chrXIV": 14,
        "chrXV": 15,
        "chrXVI": 16
    }

    # Builtin next()/iter() instead of the Python 2-only .next() method.
    first_result = next(iter(query.rows()))

    return [
        chromosomes[first_result["chromosome.primaryIdentifier"]],
        first_result["chromosomeLocation.start"],
        first_result["chromosomeLocation.end"],
        int(first_result["chromosomeLocation.strand"])
    ]
def get_yeast_gene_location(gene_name):
    '''Acquire the location of a gene from SGD http://www.yeastgenome.org

    Only the first row returned by the lookup is used.

    :param gene_name: Name of the gene.
    :type gene_name: string
    :returns location: [int: chromosome, int:biostart, int:bioend, int:strand]
    :rtype location: list
    :raises StopIteration: if the lookup returns no rows.
    :raises KeyError: if the chromosome identifier is not one of
                      chrI .. chrXVI.

    '''
    from intermine.webservice import Service

    service = Service('http://yeastmine.yeastgenome.org/yeastmine/service')

    # Get a new query on the class (table) you will be querying:
    query = service.new_query('Gene')

    # The view specifies the output columns
    query.add_view('primaryIdentifier', 'secondaryIdentifier', 'symbol',
                   'name', 'organism.shortName',
                   'chromosome.primaryIdentifier',
                   'chromosomeLocation.start', 'chromosomeLocation.end',
                   'chromosomeLocation.strand')

    # Restrict to S. cerevisiae and resolve the name through LOOKUP.
    query.add_constraint('organism.shortName', '=', 'S. cerevisiae', code='B')
    query.add_constraint('Gene', 'LOOKUP', gene_name, code='A')

    # Roman-numeral chromosome identifiers -> chromosome numbers.
    chromosomes = {'chrI': 1, 'chrII': 2, 'chrIII': 3, 'chrIV': 4,
                   'chrV': 5, 'chrVI': 6, 'chrVII': 7, 'chrVIII': 8,
                   'chrIX': 9, 'chrX': 10, 'chrXI': 11, 'chrXII': 12,
                   'chrXIII': 13, 'chrXIV': 14, 'chrXV': 15, 'chrXVI': 16}

    # Builtin next()/iter() instead of the Python 2-only .next() method.
    first_result = next(iter(query.rows()))

    return [chromosomes[first_result['chromosome.primaryIdentifier']],
            first_result['chromosomeLocation.start'],
            first_result['chromosomeLocation.end'],
            int(first_result['chromosomeLocation.strand'])]
def query(ids):
    """Query TargetMine for proteins and their associated compounds.

    :param ids: Non-empty sequence of identifier strings; they are joined
        with commas into the value of an ``IN`` constraint on ``Protein``.
    :returns: The object returned by the query's ``rows()``.
    :raises IndexError: if ``ids`` is an empty sequence.
    """
    view_paths = (
        "primaryIdentifier",
        "primaryAccession",
        "name",
        "length",
        "compounds.compound.casRegistryNumber",
        "compounds.compound.name",
        "compounds.compound.compoundGroup.name",
    )
    targetmine = Service("http://targetmine.nibio.go.jp/targetmine")
    protein_query = targetmine.new_query("Protein")
    protein_query.add_view(*view_paths)
    # The value is not used afterwards; the access is kept because it
    # makes an empty ``ids`` fail here.
    _first_id = ids[0]
    protein_query.add_constraint("Protein", "IN", ",".join(ids))
    return protein_query.rows()
def get_yeast_gene_location(gene_name):
    """Acquire the location of a gene from SGD http://www.yeastgenome.org

    Only the first row returned by the lookup is used.

    :param gene_name: Name of the gene.
    :type gene_name: string
    :returns location: [int: chromosome, int:biostart, int:bioend, int:strand]
    :rtype location: list
    :raises StopIteration: if the lookup returns no rows.
    :raises KeyError: if the chromosome identifier is not one of
                      chrI .. chrXVI.

    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Gene")

    # The view specifies the output columns
    query.add_view("primaryIdentifier", "secondaryIdentifier", "symbol",
                   "name", "organism.shortName",
                   "chromosome.primaryIdentifier",
                   "chromosomeLocation.start", "chromosomeLocation.end",
                   "chromosomeLocation.strand")

    # Restrict to S. cerevisiae and resolve the name through LOOKUP.
    query.add_constraint("organism.shortName", "=", "S. cerevisiae", code="B")
    query.add_constraint("Gene", "LOOKUP", gene_name, code="A")

    # Roman-numeral chromosome identifiers -> chromosome numbers.
    chromosomes = {"chrI": 1, "chrII": 2, "chrIII": 3, "chrIV": 4,
                   "chrV": 5, "chrVI": 6, "chrVII": 7, "chrVIII": 8,
                   "chrIX": 9, "chrX": 10, "chrXI": 11, "chrXII": 12,
                   "chrXIII": 13, "chrXIV": 14, "chrXV": 15, "chrXVI": 16}

    # Builtin next()/iter() instead of the Python 2-only .next() method.
    first_result = next(iter(query.rows()))

    return [chromosomes[first_result["chromosome.primaryIdentifier"]],
            first_result["chromosomeLocation.start"],
            first_result["chromosomeLocation.end"],
            int(first_result["chromosomeLocation.strand"])]
def intermine_query(type):
    """Build (but do not run) a YeastMine sequence query.

    :param type: Name of the class (table) the query is rooted at.
    :returns: A query selecting ``primaryIdentifier`` and
        ``sequence.residues``.
    """
    from intermine.webservice import Service

    yeastmine = Service("http://yeastmine.yeastgenome.org/yeastmine/service")
    # Root the query at the requested class.
    sequence_query = yeastmine.new_query(type)
    # Output columns.
    sequence_query.add_view("primaryIdentifier", "sequence.residues")
    return sequence_query
def parse(self, limit=None):
    """Load gene-to-phenotype associations from MouseMine into the graph.

    For every two-digit prefix ``MGI:10`` .. ``MGI:99`` an
    OntologyAnnotation query (subject: SequenceFeature, term: MPTerm,
    taxon ``self.txid``) is run. Each row becomes a ``G2PAssoc`` added to
    ``self.graph``; when the row has a PubMed id, a journal-article
    ``Reference`` is added to the graph and attached as the source.

    :param limit: If truthy, stop after this many id prefixes have been
        processed.
    :return: None
    """
    # The service object and the logger levels do not change between
    # prefixes, so set them up once instead of once per iteration.
    service = Service("http://www.mousemine.org/mousemine/service")
    logging.getLogger('Model').setLevel(logging.ERROR)
    logging.getLogger('JSONIterator').setLevel(logging.ERROR)

    count = 0
    for num in range(10, 100):
        fuzzy_gene = "MGI:{0}*".format(num)
        gene = "MGI:{0}".format(num)

        query = service.new_query("OntologyAnnotation")
        # Type constraints come before the paths they constrain are used.
        query.add_constraint("subject", "SequenceFeature")
        query.add_constraint("ontologyTerm", "MPTerm")
        query.add_view(
            "subject.primaryIdentifier", "subject.symbol",
            "subject.sequenceOntologyTerm.name", "ontologyTerm.identifier",
            "ontologyTerm.name", "evidence.publications.pubMedId",
            "evidence.comments.type", "evidence.comments.description"
        )
        query.add_sort_order("OntologyAnnotation.ontologyTerm.name", "ASC")
        query.add_constraint("subject.organism.taxonId", "=", self.txid,
                             code="A")
        query.add_constraint("subject", "LOOKUP", fuzzy_gene, code="B")
        query.add_constraint(
            "subject.primaryIdentifier", "CONTAINS", gene, code="C")
        query.outerjoin("evidence.comments")

        for row in query.rows():
            mgi_curie = row["subject.primaryIdentifier"]
            mp_curie = row["ontologyTerm.identifier"]
            pub_curie = "PMID:{0}".format(
                row["evidence.publications.pubMedId"])
            assoc = G2PAssoc(self.graph, self.name, mgi_curie, mp_curie)
            # Only rows that carry a PubMed id get a reference and source.
            if row["evidence.publications.pubMedId"]:
                reference = Reference(
                    self.graph, pub_curie, self.globaltt['journal article'])
                reference.addRefToGraph()
                assoc.add_source(pub_curie)

            assoc.add_evidence(
                self.globaltt['experimental phenotypic evidence'])
            assoc.add_association_to_graph()

        # Progress message after every tenth prefix.
        if not count % 10 and count != 0:
            count_from = count - 10
            LOG.info(
                "%s processed ids from MGI:%i* to MGI:%i*",
                datetime.datetime.now(), count_from, count)

        count += 1
        if limit and count >= limit:
            break

    return
def query_mousemine(intermine_url: str, gene_id: str) -> IntermineResult:
    """
    Look up a M. musculus sequence feature and collect its primary ids.

    :param intermine_url: intermine server, eg
                          http://www.mousemine.org/mousemine/service
    :param gene_id: gene ID, eg ENSMUSG00000063180
    :return: Intermine_Result object
    """
    mine = Service(intermine_url)
    feature_query = mine.new_query("SequenceFeature")
    feature_query.add_view("primaryIdentifier")
    feature_query.add_constraint(
        "SequenceFeature", "LOOKUP", "{}".format(gene_id), code="A")
    feature_query.add_constraint(
        "organism.shortName", "=", "M. musculus", code="B")

    identifiers = []
    for hit in feature_query.rows():
        identifiers.append("{}".format(hit['primaryIdentifier']))
    return intermine_response_factory(identifiers, gene_id)
def query_fishmine(intermine_url: str, protein_id: str, query: str = "Gene") -> IntermineResult:
    """Find primary identifiers cross-referenced to a protein id.

    :param intermine_url: URL of the InterMine service to query.
    :param protein_id: Identifier matched against
        ``crossReferences.identifier``.
    :param query: Name of the class the query is rooted at.
    :return: Result of ``intermine_response_factory`` for the list of
        matching identifiers (each prefixed with ``ZFIN:``) and
        ``protein_id``.
    """
    mine_query = Service(intermine_url).new_query(query)
    mine_query.add_view("primaryIdentifier")
    mine_query.add_constraint(
        "primaryIdentifier", "CONTAINS", "ZDB*", code="A")
    mine_query.add_constraint(
        "crossReferences.identifier", "=", "{}".format(protein_id), code="B")

    zfin_ids = []
    for hit in mine_query.rows():
        zfin_ids.append("ZFIN:{}".format(hit['primaryIdentifier']))
    return intermine_response_factory(zfin_ids, protein_id)
def test(self):
    '''
    Tests the HumanMine API

    Look up symbol for APOBEC3G, should return APOBEC3G.

    :returns: 1 when the symbol of the last returned row is "APOBEC3G",
              otherwise 0 (including when the lookup returns no rows).
    '''
    service = SS('http://www.humanmine.org/humanmine/service')
    query = service.new_query("Gene")
    query.add_view("symbol")
    query.add_constraint("Gene", "LOOKUP", "APOBEC3G", code="A")

    # Start from None so that an empty result set reports failure (0)
    # instead of raising UnboundLocalError.
    symbol = None
    for row in query.rows():
        symbol = row['symbol']

    return 1 if symbol == "APOBEC3G" else 0
def query_intermine(genes):
    """Build (but do not run) a MouseMine phenotype-annotation query.

    :param genes: Iterable of gene identifier strings; they are joined
        with ", " into the value of the ``LOOKUP`` constraint.
    :returns: The configured OntologyAnnotation query (subject:
        SequenceFeature, term: MPTerm, taxon 10090).
    """
    lookup_value = ', '.join(genes)

    from intermine.webservice import Service

    mousemine = Service("http://www.mousemine.org/mousemine/service")
    annotation_query = mousemine.new_query("OntologyAnnotation")

    # Type constraints first.
    annotation_query.add_constraint("ontologyTerm", "MPTerm")
    annotation_query.add_constraint("subject", "SequenceFeature")

    # Output columns and ordering.
    annotation_query.add_view(
        "subject.primaryIdentifier",
        "subject.symbol",
        "subject.sequenceOntologyTerm.name",
        "ontologyTerm.identifier",
        "ontologyTerm.name",
        "evidence.publications.pubMedId",
        "evidence.comments.type",
        "evidence.comments.description",
    )
    annotation_query.add_sort_order(
        "OntologyAnnotation.ontologyTerm.name", "ASC")

    # Value constraints.
    annotation_query.add_constraint(
        "subject.organism.taxonId", "=", "10090", code="A")
    annotation_query.add_constraint(
        "subject", "LOOKUP", lookup_value, code="B")

    annotation_query.outerjoin("evidence.comments")
    return annotation_query
def query_mousemine(intermine_url: str, gene_id: str) -> IntermineResult:
    """
    Look up a M. musculus sequence feature and collect its primary ids.

    :param intermine_url: intermine server, eg
                          http://www.mousemine.org/mousemine/service
    :param gene_id: gene ID, eg ENSMUSG00000063180
    :return: Intermine_Result object
    """
    feature_query = Service(intermine_url).new_query("SequenceFeature")
    feature_query.add_view("primaryIdentifier")
    feature_query.add_constraint(
        "SequenceFeature", "LOOKUP", "{}".format(gene_id), code="A")
    feature_query.add_constraint(
        "organism.shortName", "=", "M. musculus", code="B")

    identifiers = []
    for hit in feature_query.rows():
        identifiers.append("{}".format(hit['primaryIdentifier']))
    return intermine_response_factory(identifiers, gene_id)
def query_mousemine_to_create_mouse_het_lethal_knockout_genes():
    """Export MouseMine 'lethal' MP annotations of 'ht'-zygosity genotypes.

    Queries OntologyAnnotation records whose MP term name contains "lethal"
    and whose base-annotation genotype has zygosity "ht", then writes a
    header row plus one row per result, tab-delimited, to
    ``SOURCE_DATA_FOLDER + 'mouse_het_lethal.tsv'`` via ``write_table_to_csv``.
    """
    # Output columns, in the order they appear in every written row.
    columns = (
        "subject.primaryIdentifier",
        "subject.symbol",
        "evidence.baseAnnotations.subject.symbol",
        "evidence.baseAnnotations.subject.background.name",
        "evidence.baseAnnotations.subject.zygosity",
        "ontologyTerm.identifier",
        "ontologyTerm.name",
    )

    mousemine = Service("http://www.mousemine.org/mousemine/service")
    annotation_query = mousemine.new_query("OntologyAnnotation")
    for path, subclass in (
        ("ontologyTerm", "MPTerm"),
        ("subject", "SequenceFeature"),
        ("evidence.baseAnnotations.subject", "Genotype"),
    ):
        annotation_query.add_constraint(path, subclass)
    annotation_query.add_view(*columns)
    annotation_query.add_sort_order("OntologyAnnotation.subject.symbol", "ASC")
    annotation_query.add_constraint(
        "evidence.baseAnnotations.subject.zygosity", "=", "ht", code="B"
    )
    annotation_query.add_constraint("ontologyTerm.name", "CONTAINS", "lethal", code="A")

    headers = [
        'Subject Primary Identifier',
        'Ontology Annotation Subject . Symbol',
        'Base Annotations Subject . Symbol',
        'Subject Background',
        'Subject Zygosity',
        'Ontology Annotation Ontology Term . Identifier',
        'Ontology Annotation Term Name',
    ]

    table = [headers]
    table.extend(
        [result[column] for column in columns]
        for result in annotation_query.rows()
    )

    output_csv = SOURCE_DATA_FOLDER + 'mouse_het_lethal.tsv'
    write_table_to_csv(table, output_csv, delimiter='\t')
def getData(mine):
    """
    Print the names of the datasets held by a mine, in sorted order.

    The mine's service URL is read from the InterMine registry entry for
    ``mine``; each dataset is printed as ``Name: <dataset name>``.

    example:

        >>> from intermine import registry
        >>> registry.getData('flymine')
        Name: Affymetrix array: Drosophila1
        Name: Affymetrix array: Drosophila2
        Name: Affymetrix array: GeneChip Drosophila Genome 2.0 Array
        Name: Affymetrix array: GeneChip Drosophila Genome Array
        Name: Anoph-Expr data set
        Name: BDGP cDNA clone data set.....

    :param mine: mine name appended to the registry instances URL
    :return: None after printing, or the string "No such mine available"
        when a KeyError is raised anywhere in the lookup
    """
    registry_url = "http://registry.intermine.org/service/instances/" + mine
    try:
        response = requests.get(registry_url)
        instance_info = json.loads(response.text)
        service = Service(instance_info["instance"]["url"])

        dataset_query = service.new_query("DataSet")
        dataset_query.add_view("name", "url")

        names = []
        for row in dataset_query.rows():
            try:
                names.append(row["name"])
            except KeyError:
                print("No info available")

        for name in sorted(names):
            print("Name: " + name)
        return None
    except KeyError:
        return "No such mine available"
def mine(self, category):
    """Fetch sequences from PhytoMine for the submitted names, as FASTA text.

    Reads ``self.cleaned_data['locus_id']`` (one name per line), runs one
    query per name against the given class and concatenates every hit as
    ``>primaryIdentifier`` followed by its ``sequence.residues``.

    :param category: name of the PhytoMine class the query is rooted on
    :return: FASTA-formatted string; empty when nothing matched
    """
    from intermine.webservice import Service

    # splitlines() accepts "\r\n" as well as bare "\n"; blank lines are
    # dropped so that no query is issued for an empty name.
    submitted = self.cleaned_data['locus_id'].splitlines()
    names = [line.strip() for line in submitted if line.strip()]

    service = Service("https://phytozome.jgi.doe.gov/phytomine/service")
    query = service.new_query(category)
    query.add_view("name", "primaryIdentifier", "secondaryIdentifier",
                   "sequence.residues")

    records = []
    for name in names:
        # Same constraint code on every pass, as before; presumably this
        # replaces the previous "D" constraint -- verify against the
        # intermine client if results ever look cumulative.
        query.add_constraint("name", "=", name, code="D")
        for row in query.rows():
            records.append(">" + str(row["primaryIdentifier"]) + '\n'
                           + row["sequence.residues"] + '\n')

    # Single join instead of repeated string concatenation.
    return ''.join(records)
def mine(self):
    """Fetch upstream (promoter) sequences from PhytoMine as FASTA text.

    Reads ``self.cleaned_data['promoter']`` (one gene name per line) and
    ``self.cleaned_data['size']`` (number of bases wanted). For every name
    the 5000-base upstream flanking region that excludes the gene is
    queried, and its last ``size`` bases are emitted under a
    ``>primaryIdentifier`` header.

    :return: FASTA-formatted string; empty when nothing matched
    """
    from intermine.webservice import Service

    # Length of the flanking region requested from the server; the slice
    # below is taken relative to this.
    flank_length = 5000

    size = self.cleaned_data['size']
    # Clamp at 0: a size above flank_length used to produce a negative start
    # index, which silently returned only the tail of the region.
    slice_start = max(0, flank_length - size)

    # splitlines() accepts "\r\n" as well as bare "\n"; blank lines are
    # dropped so that no query is issued for an empty name.
    submitted = self.cleaned_data['promoter'].splitlines()
    names = [line.strip() for line in submitted if line.strip()]

    service = Service("https://phytozome.jgi.doe.gov/phytomine/service")
    query = service.new_query("Gene")
    query.add_view("name", "primaryIdentifier", "secondaryIdentifier",
                   "length", "flankingRegions.length",
                   "flankingRegions.includeGene", "flankingRegions.direction",
                   "flankingRegions.primaryIdentifier",
                   "flankingRegions.sequence.length",
                   "flankingRegions.sequence.residues")
    query.add_constraint("flankingRegions.length", "=", str(flank_length), code="A")
    query.add_constraint("flankingRegions.includeGene", "=", "false", code="B")
    query.add_constraint("flankingRegions.direction", "=", "upstream", code="C")

    records = []
    for name in names:
        # Same constraint code on every pass, as before; presumably this
        # replaces the previous "D" constraint -- verify against the
        # intermine client if results ever look cumulative.
        query.add_constraint("name", "=", name, code="D")
        for row in query.rows():
            residues = row["flankingRegions.sequence.residues"]
            records.append(">" + str(row["primaryIdentifier"]) + '\n'
                           + str(residues[slice_start:]) + '\n')

    # Single join instead of repeated string concatenation.
    return ''.join(records)
with open("three.seq", "w") as seq_file_three: three_constant_type_A = 'GTAATTAACCCTCACTAAAGGG' three_constant_type_B = 'CTAATTAACCCTCACTAAAGGG' three_end = data_set[1].replace(three_constant_type_A, '').replace( three_constant_type_B, '') seq = Seq(three_end) data_set[1] = str(seq.reverse_complement()) seq_file_three.write(data_set[1]) # Write 5' probe to five.seq with open("five.seq", "w") as seq_file_five: seq_file_five.write(data_set[2]) # Need to fetch the transcript sequences for the respective gene from FlyMine: service = Service("http://www.flymine.org/flymine/service") query = service.new_query("Gene") query.add_view("transcripts.primaryIdentifier", "transcripts.length") query.add_sort_order("transcripts.length", "DESC") query.add_constraint("Gene", "LOOKUP", data_set[0], code="A") transcript_primary_identifiers = [] # For each transcript found for a given gene: for row in query.rows(): print("") print("TRANSCRIPT ID: ", row["transcripts.primaryIdentifier"], "||", "TRANSCRIPT LENGTH: ", row["transcripts.length"]) transcript_primary_identifiers.append( row["transcripts.primaryIdentifier"]) print("") print("")
def fetch_yeast_locus_sequence(locus_name, flanking_size=0):
    '''Acquire a sequence from SGD http://www.yeastgenome.org.

    :param locus_name: Common name or systematic name for the locus (e.g. ACT1
                       or YFL039C).
    :type locus_name: str
    :param flanking_size: The length of flanking DNA (on each side) to return.
                          0 fetches the locus sequence itself. Any value that
                          is neither positive nor 0 prints a warning and
                          yields an empty sequence.
    :type flanking_size: int
    :returns: ``coral.DNA`` built from the first row the query returns.
    :raises StopIteration: if the query returns no rows.

    '''
    from intermine.webservice import Service

    service = Service('http://yeastmine.yeastgenome.org/yeastmine/service')

    # Get a new query on the class (table) you will be querying:
    query = service.new_query('Gene')

    if flanking_size > 0:
        # The view specifies the output columns
        # secondaryIdentifier: the systematic name (e.g. YFL039C)
        # symbol: short name (e.g. ACT1)
        # length: sequence length
        # flankingRegions.direction: Upstream or downstream (or both) of locus
        # flankingRegions.sequence.length: length of the flanking regions
        # flankingRegions.sequence.residues: sequence of the flanking regions
        query.add_view('secondaryIdentifier', 'symbol', 'length',
                       'flankingRegions.direction',
                       'flankingRegions.sequence.length',
                       'flankingRegions.sequence.residues')

        query.add_constraint('flankingRegions.direction', '=', 'both',
                             code='A')
        query.add_constraint('Gene', 'LOOKUP', locus_name, 'S. cerevisiae',
                             code='B')
        # The distance is sent in kilobases with one decimal, e.g. '0.5kb'.
        query.add_constraint('flankingRegions.distance', '=',
                             '{:.1f}kb'.format(flanking_size / 1000.),
                             code='C')
        query.set_logic('A and B and C')

        # TODO: What to do when there's more than one result?
        # next(iter(...)) runs on Python 2 and 3; the .next() method is
        # not part of the Python 3 iterator protocol.
        first_result = next(iter(query.rows()))
        # FIXME: Use logger module instead of printing the row fields
        seq = coral.DNA(first_result['flankingRegions.sequence.residues'])
        # TODO: add more metadata

    elif flanking_size == 0:
        # The view specifies the output columns
        query.add_view('primaryIdentifier', 'secondaryIdentifier', 'symbol',
                       'name', 'sgdAlias', 'organism.shortName',
                       'sequence.length', 'sequence.residues', 'description',
                       'qualifier')

        query.add_constraint('status', 'IS NULL', code='D')
        query.add_constraint('status', '=', 'Active', code='C')
        query.add_constraint('qualifier', 'IS NULL', code='B')
        query.add_constraint('qualifier', '!=', 'Dubious', code='A')
        query.add_constraint('Gene', 'LOOKUP', locus_name, 'S. cerevisiae',
                             code='E')

        # Keep non-dubious (or unqualified) and active (or status-less)
        # entries that match the lookup.
        query.set_logic('(A or B) and (C or D) and E')

        first_result = next(iter(query.rows()))
        seq = coral.DNA(first_result['sequence.residues'])
    else:
        # Function-call form is valid on Python 3 and, with a single
        # argument, prints the same text on Python 2.
        print('Problem with the flanking region size....')
        seq = coral.DNA('')

    return seq
else: # To run your query # to use it you will require the intermine python client. # To install the client, run the following command from a terminal: # # sudo easy_install intermine # # For further documentation you can visit: # http://intermine.readthedocs.org/en/latest/web-services/ # The following two lines will be needed in every python script: from intermine.webservice import Service service = Service("http://yeastmine.yeastgenome.org/yeastmine/service") query = service.new_query("SequenceFeature") query.add_view( "primaryIdentifier", "featureType", "secondaryIdentifier", "description", "sgdAlias", "symbol" ) query.add_constraint("featureType", "=", "telomerase_RNA_gene", code = "Z") query.add_constraint("qualifier", "IS NULL", code = "W") query.add_constraint("qualifier", "!=", "Dubious", code = "V") query.add_constraint("status", "=", "Active", code = "U") query.add_constraint("featureType", "=", "transposable_element_gene", code = "S") query.add_constraint("featureType", "=", "telomeric_repeat", code = "R") query.add_constraint("featureType", "=", "telomere", code = "Q") query.add_constraint("featureType", "=", "tRNA_gene", code = "P") query.add_constraint("featureType", "=", "snoRNA_gene", code = "O") query.add_constraint("featureType", "=", "snRNA_gene", code = "N") query.add_constraint("featureType", "=", "LTR_retrotransposon", code = "M")
def find_forkhead(chrom_ident, pattern):
    """Find forkhead motif positions on one yeast chromosome and log them.

    The chromosome sequence is fetched from YeastMine and scanned with the
    pattern on both strands. Every distinct matched sequence (reverse-strand
    hits are converted back to the forward strand) is then located on the
    forward strand. The sorted 0-based start positions are printed and, if
    there are any, appended to ``output_data/<chrom_ident>_fkh_motifs.csv``
    as ``chrom_ident,position,pattern name`` rows.

    :param chrom_ident: chromosome primaryIdentifier to fetch; also used in
        the output file name and in every CSV row
    :param pattern: mapping with a "pattern" entry (expression passed to the
        ``regex`` module) and a "name" entry (label written to the output)
    """
    import os

    import regex
    from intermine.webservice import Service

    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Chromosome")

    # The view specifies the output columns
    query.add_view("primaryIdentifier", "sequence.residues")

    query.add_constraint("Chromosome.primaryIdentifier", "=", chrom_ident)

    # Return one result and raise error if there is more than one
    query.one()

    chromosome = ''
    for row in query.rows():
        chromosome = row["sequence.residues"]

    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    def rev_comp(dna):
        """Return the reverse complement of ``dna``.

        Characters other than A/T/C/G are reported and left out.
        """
        # Collect into a list and join once; repeated string concatenation
        # over a whole chromosome is needlessly slow.
        comp = []
        for nucl in dna:
            base = complement.get(nucl)
            if base is None:
                print('Not a DNA sequence')
            else:
                comp.append(base)
        comp.reverse()
        return ''.join(comp)

    chromosome_rev = rev_comp(chromosome)

    motif_regex = pattern["pattern"]
    watson_hits = [hit.group(0)
                   for hit in regex.finditer(motif_regex, chromosome)]
    # Reverse-strand hits are turned back into forward-strand text so that
    # they can be located on the forward sequence below.
    crick_hits = [rev_comp(hit.group(0))
                  for hit in regex.finditer(motif_regex, chromosome_rev)]
    distinct_motifs = set(watson_hits + crick_hits)

    fkh_motif_coords = []
    for motif in distinct_motifs:
        fkh_motif_coords.extend(
            hit.start() for hit in regex.finditer(motif, chromosome))
    fkh_motif_coords.sort()

    print("Pattern {} matches: {}".format(pattern["name"], fkh_motif_coords))

    # os.path.join replaces the hard-coded backslash, which was an invalid
    # string escape and only acted as a directory separator on Windows.
    file_name = os.path.join("output_data",
                             "{}_fkh_motifs.csv".format(chrom_ident))
    if fkh_motif_coords:
        with open(file_name, 'a') as f:
            for position in fkh_motif_coords:
                f.write("{},{},{}\n".format(chrom_ident, position,
                                            pattern["name"]))
gene_name_list = args.gene_name_list # read the taxon ids taxon_ids = {} with open('data/phytozome_species_ids.txt') as csv_file: t_ids = csv.reader(csv_file, delimiter=',') for row in t_ids: taxon_ids[row[0]] = row[1] # read the gene names with open(gene_name_list, 'r') as f: gene_names = [line.strip() for line in f] # run the Phytomine query service = Service('https://phytozome.jgi.doe.gov/phytomine/service') query = service.new_query('Gene') views = ["length", "organism.taxonId", "primaryIdentifier", "organism.shortName", "ontologyAnnotations.ontologyTerm.identifier", "ontologyAnnotations.ontologyTerm.name", "ontologyAnnotations.ontologyTerm.namespace"] query.add_view(views) query.add_constraint('primaryIdentifier', 'ONE OF', gene_names, code='A') query.add_constraint('organism.taxonId', 'ONE OF', list(taxon_ids.values()), code='B') query.set_logic("A and B") # write results tmp = tempfile.mkstemp(suffix=".txt", text=True)[1] sep = "\t" with open(tmp, 'w') as outfile:
from intermine.webservice import Service service = Service("http://yeastmine.yeastgenome.org/yeastmine/service") # Get a new query on the class (table) you will be querying: query = service.new_query("Chromosome") # The view specifies the output columns query.add_view("primaryIdentifier", "sequence.residues") # Uncomment and edit the line below (the default) to select a custom sort order: # query.add_sort_order("Chromosome.primaryIdentifier", "ASC") # chrIII = 'GATTACAGGGAATTTGTTTAATAGCAATTTATACGCTTTGTTATCGGCACCACCAAATTCTGGGATAACCGTTAATTCTTCCTCAGGTTTGCCTAGTGGATCCTCTCCTTCTGGAGTTTGGCCACGCTCTGGCTTTTCGATCAGACTTGGCATGTGACTAATCAAGTATGGCATGCTGGTTTTTGGGTCCTTTGTTTTCGTTGTTTCAGTCTGGATAAATTTTAAGTTACCATTATCGAAGGCACTTTTGTACTTGTCACTAATTAAAGATGCAATGTCAGCGGGGATACTCATTTTTATTTTAATTTTTACTTTTCTGTTTGTTCTAAAATCTATCTAAACTGGCTTTCAAGATCAATCTATTGTCTTTTAAGGTAAACTTTAAATTGGAAATAATAGTAATGTTAGTTCCTTCATTTTAACCTTGTATTGTATTTCCTTTGCGTGATGAAAAAAAAACTGAAAAAGAGAAAAATAAGAAAATCTTCTAGAACGTTCCGAAACAGGACACTTAGCACACAAATACAGAATAGGAAAGTAAAAGGCAATATATGAATGCAGTGCTTGTAACTGGTGCTTGTATCCAAGAATAGCTTCTTGCTGTAGGTTATGGGAATATCGTGTAAGCTGGGGTGACTTTTGAGCTATTCGCGACGCCCGACGCCGTAATAACTACTTTCGACAGACCACTTATGACAGTATTTCAGGCCGCTCTTATAAAATGACATGTTAACAAACAGTTCTGATTATTCGCCTTTTGACAGGACGATAATGTAAATAGTTGTGGTAGTATCATTCAGGTATGTAACTGTTTACTTTGTATCGCTTGAAAAAAATAAGCATTTCAGAGCCTTCTTTGGAGCTCAAGTGGATTGAGGCCACAGCAAGACCGGCCAGTTTGAATGCTCAACTCTTCAAAAGAAATTCCTCAAATATGTCCAGTTTCATGTACTGTCCGGTGTGATTTATTATTTTTTATTTACTTTGTAGTTCTTAAAGCTAAGATTTTTTTCTTTGATAAATTCTTGTTTTCATATCCTAAAATTAAAGGGAAAATAAACAATACATAACAAAACATATAAAAACCAACACAATAAAAAAAAGGATCAAATACTCATTAAAGTAACTTACACGGGGGCTAAAAACGGAGTTTGATGAATATTCACAAGATAAAAATCATATGTATGTTTCTGATATATCGATATACAATCAAACACTTTCAAGAATTTGTTTGTAGACTTTTTGCTAGAGACCTCATCAAAGTGCTACCAACTAAGATCAACTTATACTTCTTTTAGAGAAAATTTTTTTCAATGTACTCCAAAGAGATTTAGATCCTGTCTCTTCCTCTTCCTCTTCCTCGAAAGTCAAAGAAAAATCAGAGTCTCCCTGCTTATTCAGGCGGAGAGGCTCTAGGGTAGTTGCGTTTCTCTCATTGGGACACTGAACCTCATTTTCCAACATTTTGGTCATGTAAGAGGCGACAGGCTCATCGCAGGTGGGTGCATCAACATGGTAGTACCTGGACCAAGCG
CTACATTGAGTCCCTCCTGGATAAACACCGCTACAATATTGTCTTTGGACGTTTGCCCAAACCATATCTTTTGAATACCAAAGCTGGACCACATTGTATGGCCTAATCATTGGTGCTACCATAATACTGGATTGGGAAACAGTCTGGTTAATTTTTTTCAACCAATTTTTCTTATCTAGCAATGATTTAATAAACCTGAAATCTAAATTGTCTTCGTTAGCGTCTGTGTCATAATCTACAATTGAGTACTGTGACGTCCAATTATATGGCACCGAGATGGGGAATCTGTCCGGTGTTTCGTCGCTGTTATCCTTCTCCTCCCTCCAAATGCAGTCAGAGGCAGGTGCCCATTCGGTTCGCCAGTCTCCGTTATTTACTACTTGGTACTGTTCCCAATCGTAATACGTTTCCTCTGGGTTGAAGATACTTGCTCTGCTCTTGACATTGCCCATAGCCACACCACGAGAAACATCGTGGAAGATTACGGAGCTGTTTACGATAGCAGGAGCAATGGATTTGACGAATGACACTTGATAAAAGTCTTTGGTCGAAAA' chromosome = "" for row in query.rows(): if row["primaryIdentifier"] == "chrIII": chromosome = row["sequence.residues"] def rev_comp(dna): comp = "" for nucl in dna: if nucl == "A": comp = comp + "T" elif nucl == "T": comp = comp + "A" elif nucl == "C": comp = comp + "G" elif nucl == "G": comp = comp + "C"
# This is an automatically generated script to run your query # to use it you will require the intermine python client. # To install the client, run the following command from a terminal: # # sudo easy_install intermine # # For further documentation you can visit: # http://intermine.readthedocs.org/en/latest/web-services/ # The following two lines will be needed in every python script: from intermine.webservice import Service service = Service("https://yeastmine.yeastgenome.org:443/yeastmine/service", token="YOUR-API-KEY") # Get a new query on the class (table) you will be querying: query = service.new_query("Gene") # The view specifies the output columns query.add_view("primaryIdentifier", "secondaryIdentifier", "organism.shortName", "symbol", "name") # You can edit the constraint values below query.add_constraint("Gene", "IN", "Gene list for S. cerevisiae 17 Dec 2018 11.4", code="A") # Uncomment and edit the code below to specify your own custom logic: # query.set_logic("A") for row in query.rows():
def generate_graph():
    """Plot ribosomal-protein gene positions per chromosome and save an SVG.

    Queries YeastMine for genes named "Ribosomal Protein of the Large
    subunit" / "... Small subunit" that are annotated with the GO term
    "cytoplasmic translation" (qualifier absent or not "NOT"). Draws one
    horizontal bar per chromosome (bar length = chromosome length), marks
    each gene at the midpoint of its start/end coordinates, labels it with
    its symbol, and writes the figure to ``static/rp_positions.svg``.
    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Gene")

    # Type constraints should come early - before all mentions of the paths
    # they constrain
    query.add_constraint("goAnnotation.ontologyTerm", "GOTerm")

    # The view specifies the output columns
    query.add_view(
        "secondaryIdentifier",
        "symbol",
        "goAnnotation.ontologyTerm.identifier",
        "description",
        "chromosome.primaryIdentifier",
        "chromosomeLocation.start",
        "chromosomeLocation.end",
        "chromosome.length",
    )

    # This query's custom sort order is specified below:
    query.add_sort_order("Gene.symbol", "ASC")

    query.add_constraint("goAnnotation.qualifier", "IS NULL", code="C")
    query.add_constraint("goAnnotation.qualifier", "!=", "NOT", code="B")
    query.add_constraint("goAnnotation.ontologyTerm.name", "=", "cytoplasmic translation", code="A")
    query.add_constraint(
        "name",
        "ONE OF",
        ["Ribosomal Protein of the Large subunit", "Ribosomal Protein of the Small subunit"],
        code="D",
    )

    # Your custom constraint logic is specified with the code below:
    query.set_logic("A and (B or C) and D")

    # chromosome id -> {"length": chromosome length, "genes": [gene dicts]}
    chromosome = {}
    for row in query.rows():
        chr_id = row["chromosome.primaryIdentifier"]
        if chr_id not in chromosome:
            chromosome[chr_id] = {"length": row["chromosome.length"], "genes": []}
        chromosome[chr_id]["genes"].append(
            {"symbol": row["symbol"], "start": row["chromosomeLocation.start"], "end": row["chromosomeLocation.end"]}
        )

    # Concrete lists rather than dict views / range objects, so matplotlib
    # receives plain sequences for tick positions and labels.
    all_chr_ids = list(chromosome)
    all_chr_length = [chromosome[chr_id]["length"] for chr_id in all_chr_ids]
    y_pos = list(range(len(all_chr_ids)))

    fig = plt.figure()
    try:
        fig.set_size_inches(25, 10)

        # One horizontal bar per chromosome; bar length is chromosome length.
        plt.barh(y_pos, all_chr_length, align="center", alpha=0.4)
        plt.yticks(y_pos, all_chr_ids)  # position and label for the y-axis
        plt.xlabel("Gene Positions")  # sets label for x-axes
        plt.title("Ribosomal protein Genes in Yeast Genome")

        for chromosome_y_axis, chr_id in enumerate(all_chr_ids):
            for gene in chromosome[chr_id]["genes"]:
                posn = (gene["start"] + gene["end"]) / 2
                plt.plot(posn, chromosome_y_axis, "ro")
                plt.annotate(gene["symbol"], (posn - 2, chromosome_y_axis + 0.1), rotation="vertical")

        fig.savefig("static/rp_positions.svg")
    finally:
        # pyplot keeps every figure it creates until it is closed; release it
        # so repeated calls do not accumulate figures in memory.
        plt.close(fig)
# This is an automatically generated script to run your query # to use it you will require the intermine python client. # To install the client, run the following command from a terminal: # # sudo easy_install intermine # # For further documentation you can visit: # http://intermine.readthedocs.org/en/latest/web-services/ # The following two lines will be needed in every python script: from intermine.webservice import Service service = Service("http://yeastmine.yeastgenome.org/yeastmine/service") # Get a new query on the class (table) you will be querying: query = service.new_query("ARS") # The view specifies the output columns query.add_view( "chromosome.primaryIdentifier", "chromosomeLocation.start", "chromosomeLocation.end", "secondaryIdentifier" ) # Uncomment and edit the line below (the default) to select a custom sort order: # query.add_sort_order("ARS.chromosome.primaryIdentifier", "ASC") ''' for row in query.rows(): print row["chromosome.primaryIdentifier"], row["chromosomeLocation.start"], \ row["chromosomeLocation.end"], row["secondaryIdentifier"] ''' chromosome_list = []
#!/usr/bin/python
# Print every result of a SynBioMine query whose view is the symbol and
# name of Gene.
from intermine.webservice import Service

service = Service('http://synbiomine.org/query/service')
query = service.new_query()
query.add_view('Gene.symbol', 'Gene.name')

for row in query.results():
    # Function-call form: valid on Python 3 and, with a single argument,
    # prints the same thing on Python 2.
    print(row)
def get_chromosomal_coordinates_as_FASTA(chr_id, region_str, use_colon=False,
    extension_for_saving=extension_for_saving, return_text=False):
    '''
    Main function of script.
    Fetch the sequence of a chromosomal region from YeastMine and produce it
    as a FASTA record, either saved to a file or returned as text.

    The order of the two coordinates selects the strand: ascending order for
    the Watson strand, descending order for the Crick strand, following the
    convention at https://www.yeastgenome.org/seqTools under 'Search a
    specified chromosomal region of S288C genome'.

    chr_id: chromosome designation; "chr" is prepended to form the
        YeastMine primaryIdentifier.
    region_str: the two coordinates joined by the delimiter.
    use_colon: split `region_str` on ":" instead of the module default
        delimiter.
    extension_for_saving: passed to `generate_output_file_name` when saving.
    return_text: pass `True` when calling from IPython or a Jupyter notebook
        to get the FASTA record back as text instead of a saved file.
    '''
    # Split the region string into its two coordinates.
    #---------------------------------------------------------------------------
    delimiter = ":" if use_colon else coordinates_delimiter_default
    coordinate_fields = region_str.split(delimiter)
    start, end = int(coordinate_fields[0]), int(coordinate_fields[1])

    # Users who count from zero get moved to 1, because the slice further
    # down subtracts one and would otherwise go negative.
    start = 1 if start == 0 else start
    end = 1 if end == 0 else end

    # sanity check; `start < end` cannot be required because a larger start
    # is how the Crick strand is requested
    assert start != end, (
        "The user-supplied 'start' ({}) and 'end' ({}) cannot be same value"
        ".".format(start,end))

    # Ascending coordinates mean Watson; anything else means Crick.
    get_crick_strand = not (start < end)
    if get_crick_strand:
        strand_text = "Crick(-1)"
        sys.stderr.write("Sequence on Crick strand specified...")
    else:
        strand_text = "Watson(1)"
        sys.stderr.write("Sequence on Watson strand specified...")

    # Get chromosome information from YeastMine
    #---------------------------------------------------------------------------
    # The whole chromosome sequence is fetched and then cut down to the
    # requested coordinates.
    service = Service("https://yeastmine.yeastgenome.org:443/yeastmine/service")
    query = service.new_query("Chromosome")
    query.add_view("sequence.residues")
    chr_designation = "chr"+chr_id
    query.add_constraint("primaryIdentifier", "=", chr_designation, code = "A")
    results = list(query.rows())

    # The minus one turns the user's 1-based lower coordinate into a
    # zero-based slice start; the upper coordinate stays inclusive.
    lower, upper = min(start,end), max(start,end)
    genomic_seq = results[0]["sequence.residues"][lower-1:upper]

    # Details used for building the output file name.
    chr_info = {
        'chr_nom': chr_designation,
        'start': start,
        'end': end,
    }

    # feedback
    sys.stderr.write("retrieving sequence from chromosome "
        "{}...".format(chr_id))

    # Make output FASTA record
    #---------------------------------------------------------------------------
    # Description line loosely based on output from
    # https://www.yeastgenome.org/seqTools under 'Search a specified chromosomal
    # region of S288C genome'.
    record_description = 'coordinates {} to {}; strand is {}'.format(
        start, end, strand_text)
    record = SeqRecord(
        Seq(genomic_seq, generic_dna),
        id=chr_designation,
        description=record_description)

    # The Crick strand is produced from the record so that the
    # `.reverse_complement` method can be used; id and description are kept.
    if get_crick_strand:
        record = record.reverse_complement(id=True,description=True)
    sys.stderr.write("making FASTA formatted entry with retrieved sequence...")

    # Return text if called with `return_text = True`. Otherwise, consider
    # called from command line & save file.
    #---------------------------------------------------------------------------
    if return_text == True:
        sys.stderr.write("\nReturning genomic sequence in FASTA format.")
        return record.format("fasta")

    output_file_name = generate_output_file_name(
        chr_info,extension_for_saving)
    SeqIO.write(record, output_file_name, "fasta")
    sys.stderr.write("\n\nFile of genomic sequence "
        "saved as '{}'.".format(output_file_name))
    sys.stderr.write("\nFinished.\n")
# Retrieve hierarchy from wormmine
# This version retrieves just a single child
# Requires intermine installed: $ easy_install intermine
# cf. http://intermine.wormbase.org/tools/wormmine/query.do for query construction
#-------------------------------------------------------------------------------
# USAGE: python RetrieveHierarchy.py > hierarchy.csv
#-------------------------------------------------------------------------------

# Get intermine service
from intermine.webservice import Service
service = Service("http://intermine.wormbase.org/tools/wormmine/service")

# Get a new query on the class (table) you will be querying:
query = service.new_query("AnatomyTerm")

# Specify the output columns:
query.add_view("name", "synonym", "primaryIdentifier", "children.name",
               "children.primaryIdentifier", "children.synonym")

# Specify a custom sort order?:
# query.add_sort_order("AnatomyTerm.name", "ASC")

#-------------------------------------------------------------------------------
# Just print names and IDs, as "name|id|child name|child id":
for row in query.rows():
    # Function-call form: valid on Python 3 and, with a single argument,
    # prints the same text on Python 2.
    print('{0}|{1}|{2}|{3}'.format(row["name"], row["primaryIdentifier"],
                                   row["children.name"],
                                   row["children.primaryIdentifier"]))

# Names, synonyms, and IDs:
# for row in query.rows():
# print row["name"],"(",row["synonym"],"): ",row["primaryIdentifier"],",",row["children.name"], \
# This is an automatically generated script to run your query
# to use it you will require the intermine python client.
# To install the client, run the following command from a terminal:
#
#     sudo easy_install intermine
#
# For further documentation you can visit:
#     http://intermine.readthedocs.org/en/latest/web-services/

# The following two lines will be needed in every python script:
from intermine.webservice import Service
service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

# Get a new query on the class (table) you will be querying:
query = service.new_query("Protein")

# The view specifies the output columns
query.add_view(
    "genes.primaryIdentifier",
    "genes.secondaryIdentifier",
    "symbol",
    "length",
    "molecularWeight",
    "pI",
    "genes.featureType",
    "genes.sgdAlias",
    "genes.description",
    "sequence.residues",
)

# You can edit the constraint values below.
# Each entry is (constraint code, positional arguments); they are added in
# the order listed.
for constraint_code, constraint_args in (
    ("H", ("genes.featureType", "=", "intein_encoding_region")),
    ("E", ("genes.featureType", "=", "blocked_reading_frame")),
    ("B", ("genes.qualifier", "!=", "Dubious")),
    ("C", ("genes.qualifier", "IS NULL")),
    ("D", ("genes.status", "=", "Active")),
    ("F", ("genes.featureType", "=", "ORF")),
):
    query.add_constraint(*constraint_args, code=constraint_code)
else: # To run your query # to use it you will require the intermine python client. # To install the client, run the following command from a terminal: # # sudo easy_install intermine # # For further documentation you can visit: # http://intermine.readthedocs.org/en/latest/web-services/ # The following two lines will be needed in every python script: from intermine.webservice import Service service = Service("https://yeastmine.yeastgenome.org:443/yeastmine/service") #seems current as of January 2018 from the YeastMine site example, I had also prior to this change in the script, run on my machine ` sudo easy_install intermine --upgrade` query = service.new_query("SequenceFeature") query.add_view( "primaryIdentifier", "featureType", "secondaryIdentifier", "description", "sgdAlias", "symbol" ) query.add_constraint("featureType", "=", "telomerase_RNA_gene", code = "Z") query.add_constraint("qualifier", "IS NULL", code = "W") query.add_constraint("qualifier", "!=", "Dubious", code = "V") query.add_constraint("status", "=", "Active", code = "U") query.add_constraint("featureType", "=", "transposable_element_gene", code = "S") query.add_constraint("featureType", "=", "telomeric_repeat", code = "R") query.add_constraint("featureType", "=", "telomere", code = "Q") query.add_constraint("featureType", "=", "tRNA_gene", code = "P") query.add_constraint("featureType", "=", "snoRNA_gene", code = "O") query.add_constraint("featureType", "=", "snRNA_gene", code = "N") query.add_constraint("featureType", "=", "LTR_retrotransposon", code = "M")
def fetch_yeast_locus_sequence(locus_name, flanking_size=0):
    """Acquire a sequence from SGD http://www.yeastgenome.org.

    :param locus_name: Common name or systematic name for the locus (e.g. ACT1
                       or YFL039C).
    :type locus_name: str
    :param flanking_size: The length of flanking DNA (on each side) to return.
                          0 fetches the locus sequence itself. Any value that
                          is neither positive nor 0 prints a warning and
                          yields an empty sequence.
    :type flanking_size: int
    :returns: ``coral.DNA`` built from the first row the query returns.
    :raises StopIteration: if the query returns no rows.

    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Get a new query on the class (table) you will be querying:
    query = service.new_query("Gene")

    if flanking_size > 0:
        # The view specifies the output columns
        # secondaryIdentifier: the systematic name (e.g. YFL039C)
        # symbol: short name (e.g. ACT1)
        # length: sequence length
        # flankingRegions.direction: Upstream or downstream (or both) of locus
        # flankingRegions.sequence.length: length of the flanking regions
        # flankingRegions.sequence.residues: sequence of the flanking regions
        query.add_view("secondaryIdentifier", "symbol", "length",
                       "flankingRegions.direction",
                       "flankingRegions.sequence.length",
                       "flankingRegions.sequence.residues")

        query.add_constraint("flankingRegions.direction", "=", "both",
                             code="A")
        query.add_constraint("Gene", "LOOKUP", locus_name, "S. cerevisiae",
                             code="B")
        # The distance is sent in kilobases with one decimal, e.g. "0.5kb".
        query.add_constraint("flankingRegions.distance", "=",
                             "{:.1f}kb".format(flanking_size / 1000.),
                             code="C")
        query.set_logic("A and B and C")

        # TODO: What to do when there's more than one result?
        # next(iter(...)) runs on Python 2 and 3; the .next() method is
        # not part of the Python 3 iterator protocol.
        first_result = next(iter(query.rows()))
        # FIXME: Use logger module instead of printing the row fields
        seq = coral.DNA(first_result["flankingRegions.sequence.residues"])
        # TODO: add more metadata

    elif flanking_size == 0:
        # The view specifies the output columns
        query.add_view("primaryIdentifier", "secondaryIdentifier", "symbol",
                       "name", "sgdAlias", "organism.shortName",
                       "sequence.length", "sequence.residues", "description",
                       "qualifier")

        query.add_constraint("status", "IS NULL", code="D")
        query.add_constraint("status", "=", "Active", code="C")
        query.add_constraint("qualifier", "IS NULL", code="B")
        query.add_constraint("qualifier", "!=", "Dubious", code="A")
        query.add_constraint("Gene", "LOOKUP", locus_name, "S. cerevisiae",
                             code="E")

        # Keep non-dubious (or unqualified) and active (or status-less)
        # entries that match the lookup.
        query.set_logic("(A or B) and (C or D) and E")

        first_result = next(iter(query.rows()))
        seq = coral.DNA(first_result["sequence.residues"])
    else:
        # Function-call form is valid on Python 3 and, with a single
        # argument, prints the same text on Python 2.
        print("Problem with the flanking region size....")
        seq = coral.DNA("")

    return seq
def find_forkhead(chrom_ident, pattern):
    """Append matches of ``pattern`` that contain an ACS-like site to a CSV.

    Downloads every chromosome from YeastMine and keeps the residues of the
    one whose primary identifier equals ``chrom_ident``. Both that sequence
    and its reverse complement are searched for ``pattern``. Matches that
    also contain the ``acs`` motif (at most one error allowed) are printed,
    then located on the forward sequence, first as found and then
    reverse-complemented. One line per location is appended to
    ``fkh_motifs_near_acs.csv``.

    :param chrom_ident: Primary identifier of the chromosome to scan.
    :param pattern: Pattern handed to ``regex.findall``.
    :returns: None
    """
    import regex
    from intermine.webservice import Service

    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Query the Chromosome class; only identifier and residues are needed.
    query = service.new_query("Chromosome")
    query.add_view("primaryIdentifier", "sequence.residues")

    # Stays empty when no chromosome carries the requested identifier.
    chromosome = ''
    for row in query.rows():
        if row["primaryIdentifier"] == chrom_ident:
            chromosome = row["sequence.residues"]

    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    def rev_comp(dna):
        """Return the reverse complement; other characters are reported and dropped."""
        paired = []
        for nucl in dna:
            if nucl in complement:
                paired.append(complement[nucl])
            else:
                print('Not a DNA sequence')
        paired.reverse()
        return ''.join(paired)

    def find_motif(motif, seq):
        """Return (start, end, motif) for each occurrence, overlaps included.

        ``start`` is 1-based and ``end`` is ``start + len(motif)``.
        """
        hits = []
        width = len(motif)
        # Motifs are never empty here: each one contains an ACS match.
        index = seq.find(motif)
        while index != -1:
            hits.append((index + 1, index + 1 + width, motif))
            index = seq.find(motif, index + 1)
        return hits

    chromosome_rev = rev_comp(chromosome)
    watson_finds = regex.findall(pattern, chromosome)
    crick_finds = regex.findall(pattern, chromosome_rev)

    # ``{e<=1}`` is the regex module's fuzzy syntax: at most one error.
    # Alternative ACS forms tried previously:
    #   '([ATC][ATC][AT][AT]TTTA[TC][AG]TTT[AT]GTT){e<=1}'
    #   'TTATATGTTTT'
    #   'AAAACATATAA'
    acs = regex.compile('(AAC[TA]AAA[CT][GA]TAAA[AT][AT][GAT][GAT]){e<=1}')

    # Keep only the pattern matches that also contain an ACS match.
    potential_origins = []
    for strand_finds in (watson_finds, crick_finds):
        for found in strand_finds:
            candidate = str(found)
            if acs.search(candidate):
                potential_origins.append(candidate)
    print(potential_origins)

    # Locate every candidate as found first, then reverse-complemented.
    locations = []
    for candidate in potential_origins:
        locations.extend(find_motif(candidate, chromosome))
    for candidate in potential_origins:
        locations.extend(find_motif(rev_comp(candidate), chromosome))

    with open('fkh_motifs_near_acs.csv', 'a') as f:
        for start, end, sequence in locations:
            f.write("Fkh pattern: {},{}, {}, {},{}\n".format(
                pattern, chrom_ident, start, end, sequence))
from intermine.webservice import Service
from ete3 import NCBITaxa

# Taxonomy handle used by filterRanks below.
ncbi = NCBITaxa()
#ncbi.update_taxonomy_database()

service = Service("https://phytozome.jgi.doe.gov/phytomine/service")

# One row per Organism, with the columns listed here.
query = service.new_query("Organism")
query.add_view(
    "annotationVersion",
    "assemblyVersion",
    "commonName",
    "genus",
    "name",
    "proteomeId",
    "shortName",
    "species",
    "taxonId",
    "version",
)

# Organism columns, in the order they appear in the printed header.
k = [
    "proteomeId",
    "commonName",
    "name",
    "shortName",
    "annotationVersion",
    "assemblyVersion",
    "genus",
    "species",
    "taxonId",
    "version",
]

# Taxonomic ranks reported for each organism, in header order.
t = [
    "superkingdom",
    "kingdom",
    "phylum",
    "class",
    "subclass",
    "order",
    "family",
    "genus",
    "species",
]

# Tab-separated header line.
print("\t".join(k + t + ["full_lineage"]))


def filterRanks(L):
    """Return one name per rank in ``t`` for the taxids in ``L``.

    Each taxid is looked up individually to find its rank; if several taxids
    share a rank the last one wins. Ranks in ``t`` with no taxid in ``L``
    yield ``'NA'``.
    """
    rank_to_taxid = {}
    for taxid in L:
        rank_to_taxid[ncbi.get_rank([taxid])[taxid]] = taxid

    names = []
    for rank in t:
        if rank in rank_to_taxid:
            translated = ncbi.get_taxid_translator([rank_to_taxid[rank]])
            names.append(list(translated.values())[0])
        else:
            names.append('NA')
    return names
# Automatically generated script that runs an InterMine query.
#
# It needs the intermine Python client. The generator's install hint, to be
# run from a terminal, is:
#
#     sudo easy_install intermine
#
# Further documentation:
#     http://www.intermine.org/wiki/PythonClient

# The import and the service handle are needed in every such script.
from intermine.webservice import Service

service = Service("http://www.mousemine.org/mousemine/service")

# Query the OntologyTerm class (table).
query = service.new_query("OntologyTerm")

# Type constraints come before any mention of the paths they constrain.
query.add_constraint("ontologyAnnotations.subject", "Genotype")

# Output columns, in order.
view_columns = (
    "identifier",
    "name",
    "namespace",
    "ontologyAnnotations.subject.primaryIdentifier",
    "ontologyAnnotations.subject.name",
    "ontologyAnnotations.qualifier",
    "ontologyAnnotations.evidence.code.code",
    "ontologyAnnotations.evidence.publications.mgiJnum",
)
query.add_view(*view_columns)

# For a custom sort order, uncomment and edit the (default) line below:
# query.add_sort_order("OntologyTerm.identifier", "ASC")
from intermine.webservice import Service

# Service handle created with an API token; "YOUR-API-KEY" is a placeholder.
service = Service(
    "http://yeastmine.yeastgenome.org/yeastmine/service",
    token="YOUR-API-KEY",
)

# Gene query returning identifiers, organism and names.
query = service.new_query("Gene")
view_columns = (
    "primaryIdentifier",
    "secondaryIdentifier",
    "organism.shortName",
    "symbol",
    "name",
)
query.add_view(*view_columns)

# Keep only genes that are IN "systematic gene names" (presumably a saved
# list on the account behind the token -- confirm).
query.add_constraint("Gene", "IN", "systematic gene names", code="A")

# Print the selected columns of every result row.
for row in query.rows():
    print(
        row["primaryIdentifier"],
        row["secondaryIdentifier"],
        row["organism.shortName"],
        row["symbol"],
        row["name"],
    )
def find_forkhead(chrom_ident, pattern):
    """Append midpoints of ``pattern`` matches that contain an ACS-like site.

    Downloads every chromosome from YeastMine and keeps the residues of the
    one whose primary identifier equals ``chrom_ident``. Both that sequence
    and its reverse complement are searched for ``pattern``. Matches that
    also contain the ``acs`` motif (at most one error allowed) are printed,
    then located on the forward sequence, first as found and then
    reverse-complemented. For each location a line holding the chromosome
    identifier, the integer midpoint and the matched sequence is appended to
    ``Out_table_1.txt``.

    :param chrom_ident: Primary identifier of the chromosome to scan.
    :param pattern: Pattern handed to ``regex.findall``.
    :returns: None
    """
    import regex
    from intermine.webservice import Service

    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")

    # Query the Chromosome class; only identifier and residues are needed.
    query = service.new_query("Chromosome")
    query.add_view("primaryIdentifier", "sequence.residues")

    # Stays empty when no chromosome carries the requested identifier.
    chromosome = ''
    for row in query.rows():
        if row["primaryIdentifier"] == chrom_ident:
            chromosome = row["sequence.residues"]

    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    def rev_comp(dna):
        """Return the reverse complement; other characters are reported and dropped."""
        paired = []
        for nucl in dna:
            if nucl in complement:
                paired.append(complement[nucl])
            else:
                # Call form prints identically on Python 2 and Python 3.
                print('Not a DNA sequence')
        paired.reverse()
        return ''.join(paired)

    def find_motif(motif, seq):
        """Return (start, end, motif) for each occurrence, overlaps included.

        ``start`` is 1-based and ``end`` is ``start + len(motif)``.
        """
        hits = []
        width = len(motif)
        # Motifs are never empty here: each one contains an ACS match.
        index = seq.find(motif)
        while index != -1:
            hits.append((index + 1, index + 1 + width, motif))
            index = seq.find(motif, index + 1)
        return hits

    chromosome_rev = rev_comp(chromosome)
    watson_finds = regex.findall(pattern, chromosome)
    crick_finds = regex.findall(pattern, chromosome_rev)

    # Reference consensus: WWWWTTTAYRTTTWGTT
    # ``{e<=1}`` is the regex module's fuzzy syntax: at most one error.
    # Alternative ACS forms tried previously:
    #   '([ATC][ATC][AT][AT]TTTA[TC][AG]TTT[AT]GTT){e<=1}'
    #   'TTATATGTTTT'
    #   'AAAACATATAA'
    acs = regex.compile('(AAC[TA]AAA[CT][GA]TAAA[AT][AT][GAT][GAT]){e<=1}')

    # Keep only the pattern matches that also contain an ACS match.
    potential_origins = []
    for strand_finds in (watson_finds, crick_finds):
        for found in strand_finds:
            candidate = str(found)
            if acs.search(candidate):
                potential_origins.append(candidate)
    print(potential_origins)

    # Locate every candidate as found first, then reverse-complemented.
    locations = []
    for candidate in potential_origins:
        locations.extend(find_motif(candidate, chromosome))
    for candidate in potential_origins:
        locations.extend(find_motif(rev_comp(candidate), chromosome))

    # The context manager closes the file, which the original never did.
    with open('Out_table_1.txt', 'a') as f1:
        for start, end, sequence in locations:
            # Floor division keeps the integer midpoint that Python 2's
            # ``/`` produced; plain ``/`` gives a float on Python 3.
            motif_midpoint = (start + end) // 2
            f1.write("{} {} {}\n".format(chrom_ident, motif_midpoint, sequence))