Example #1
import psycopg2
import psycopg2.extras

from pfacts003.utils.credentials import get_credentials


def main():
    # Get the password for webuser
    webuser_password = get_credentials('webuser')

    # Connect to the database as the webuser
    connection = psycopg2.connect(
        "dbname='%s' user='%s' host='db' password='%s'" %
        ('pfacts003_test', 'webuser', webuser_password))

    # Make a cursor, through which the database will give us results
    cur = connection.cursor(cursor_factory=psycopg2.extras.DictCursor)

    # Construct an SQL query string
    sql = """ SELECT tree.family_id, tree_node_name.name
              FROM family, tree, tree_node, tree_node_name
             WHERE tree.id = family.canonical_tree_id
               AND tree_node.tree_id = tree.id
               AND tree_node.left_id = 1
               AND tree_node_name.tree_node_id = tree_node.id
               AND tree_node_name.name LIKE '%Cellulase%';
        """

    # Execute the query on the database server
    cur.execute(sql)
    # Loop through the results
    for row in cur:
        # Format and print each result.
        # Newer versions of psycopg2 allow the columns to be accessed by name, but
        # these are not currently installed.
        family_accession = 'bpg%07d' % int(row[0])
        description = '"%s"' % row[1]
        print '%s: %s' % (family_accession, description)
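
Because the query above splices its LIKE pattern directly into the SQL string, a parameterized variant is safer once the pattern comes from user input. A minimal sketch, assuming the same connection as above; the pattern variable is illustrative:

# Same query with a bound parameter instead of an inline pattern.
pattern = '%Cellulase%'
cur = connection.cursor(cursor_factory=psycopg2.extras.DictCursor)
cur.execute(
    """SELECT tree.family_id, tree_node_name.name
         FROM family, tree, tree_node, tree_node_name
        WHERE tree.id = family.canonical_tree_id
          AND tree_node.tree_id = tree.id
          AND tree_node.left_id = 1
          AND tree_node_name.tree_node_id = tree_node.id
          AND tree_node_name.name LIKE %s""",
    (pattern,))
for row in cur:
    print 'bpg%07d: "%s"' % (int(row[0]), row[1])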
Example #2
def connect_to_server(DB_NAME, USER):
    """
    Connects to the postgres database and returns the connection.
    """
    PWD = get_credentials(USER)
    conn = psycopg2.connect("dbname='%s' user='%s' host='db1' password='%s'" %
                            (DB_NAME, USER, PWD))
    return conn
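
A short usage sketch for context; the database and user names are taken from the other snippets on this page and may not match this function's real callers:

# Hypothetical usage: open a connection and run a trivial query.
conn = connect_to_server('pfacts003_test', 'webuser')
cur = conn.cursor()
cur.execute('SELECT 1')
print cur.fetchone()[0]
conn.close()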
Example #3
import os
import cPickle

import psycopg2
import psycopg2.extras

from pfacts003.utils.credentials import get_credentials


def main():
    base_dir = '/clusterfs/ohana/external/genomes/QuestForOrthologs/Release5'
    os.chdir(base_dir)
    info_of_uniprot_accession = {}
    f = open("all_uniprot_accessions.txt")
    for line in f:
        taxon_id, accession = line.strip().split(',')
        info_of_uniprot_accession[accession] = {}
        info_of_uniprot_accession[accession]['taxon'] = taxon_id
    f.close()

    uniprot_accessions_of_uniprot_id = {}

    bpg_password = get_credentials('bpg_user')

    connection = psycopg2.connect(
        "dbname='%s' user='%s' host='db' password='%s'" %
        ('pfacts003_test', 'bpg_user', bpg_password))

    cur = connection.cursor('server_side_cursor',
                            cursor_factory=psycopg2.extras.DictCursor)

    sql = 'SELECT * FROM uniprot_dat_index'
    cur.execute(sql)
    for row in cur:
        if row[1] in info_of_uniprot_accession:
            accession = row[1]
            uniprot_id = int(row[2])
            info_of_uniprot_accession[accession]['uniprot_id'] \
                = uniprot_id
            if uniprot_id not in uniprot_accessions_of_uniprot_id:
                uniprot_accessions_of_uniprot_id[uniprot_id] = set()
            uniprot_accessions_of_uniprot_id[uniprot_id].add(accession)

    f = open("info_of_uniprot_accession.pkl", "w")
    cPickle.dump(info_of_uniprot_accession, f)
    f.close()
    f = open("uniprot_accessions_of_uniprot_id.pkl", "w")
    cPickle.dump(uniprot_accessions_of_uniprot_id, f)
    f.close()
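
The two pickle files can later be read back in one step each; a minimal sketch, assuming the files were written by the function above:

import cPickle

# Load the lookup tables written by main() above.
f = open("info_of_uniprot_accession.pkl", "rb")
info_of_uniprot_accession = cPickle.load(f)
f.close()
f = open("uniprot_accessions_of_uniprot_id.pkl", "rb")
uniprot_accessions_of_uniprot_id = cPickle.load(f)
f.close()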
Example #4
    def _handle_conflict(self, gene_info, taxon, gene_id, status, gn_accession,
                         start_position, end_position, orientation):
        """Handle input file conflicts

        It is possible for the input file to have duplicate start and
        end position records.

        One such example is:
            taxid   GeneID     status    start   end   orientation
            9606   100500719      -      89453  90011      -
            9606   100500719   INFERRED    100    658      +
        
        The above is a pseudo-gene, and one we will not import.

        Another example is:
            taxid     GeneID  status     start      end    orientation
             3702    5007813 REVIEWED  21445902  21447340      -
             3702    5007813 REVIEWED  21445902  21447340      -
        
        Although the above appears to be a duplicate record, there are
        fields that we do not import. Showing those other fields, in
        the same row order, makes clear that the two entries differ:
       
        taxid   GeneID   RNA_nucl_accession RNA_nucl_gi  protein_access.
        3702    5007813  NM_001084269.2      186491663    NP_001077738.1 
        3702    5007813  NM_001124039.1      186491666    NP_001117511.1
      
        """
        # Determine if the records are identical:
        if (self.gene_id == gene_id and
                self.status == status and
                self.start_position == start_position and
                self.end_position == end_position and
                self.orientation == orientation):
            # For our purposes these records are identical, so the
            # conflict can be ignored.
            return

        # If we don't have the gene record, we don't care about this
        # record anyhow
        conn = psycopg2.connect('dbname=%s host=db user=%s password=%s' %
                                (DB_NAME, DB_USER, get_credentials(DB_USER)))

        cur = conn.cursor()
        cur.execute("select uniprot_id from uniprot_gene_id where geneid=%d" %
                    gene_id)
        uniprot = cur.fetchone()

        # If we do have the gene record, we need to take further steps
        if uniprot is not None:
            if self.gn_accession.startswith('NC_') and gn_accession.startswith(
                    'NW_'):
                # NC trumps NW
                return

            if self.gn_accession.startswith('NW_') and gn_accession.startswith(
                    'NC_'):
                # update with new and return
                self._overwrite(gene_info, taxon, gene_id, status,
                                gn_accession, start_position, end_position,
                                orientation)
                return

            print "Warning: Conflict in tax_on %s, gene %s. Start Position: %d fighting start position: %d" % (
                self.taxon, self.gene_id, start_position, self.start_position)
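
The NC_/NW_ precedence check above can be read as a small predicate; a sketch for illustration only (the helper name is ours, not the original importer's):

def existing_accession_wins(existing, incoming):
    """Return True if the record already stored should win.

    RefSeq chromosome accessions (NC_) trump contig/scaffold
    accessions (NW_), as in the conflict handler above.
    """
    return existing.startswith('NC_') and incoming.startswith('NW_')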
Example #5
'''
PHOG_COLLECTION_SQL
PHOG_DATA_SQL
FAMILY_TAXA_SQL
SEQUENCE_COUNT_SQL
PFAM_ACCESSION_FROM_NAME
FAMILY_PFAM_DOMAIN_NAME
'''
########## Connection to server
import psycopg2
import psycopg2.extras
from pfacts003.utils.credentials import get_credentials

# Database connection globals
DB_NAME = 'pfacts003_test'
USER = '******'
PWD = get_credentials(USER)


def connect_to_server():
    """
    Connects to postgres database and returns the cursor.
    """
    conn = psycopg2.connect("dbname='%s' user='******' host='db' password='******'" %
                            (DB_NAME, USER, PWD))
    cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    return cur


def sql_results(cur, sql_query, parameter_tuple):
    """
    Run the sql query and return results as a list of dictionaries.
    """
    cur.execute(sql_query, parameter_tuple)
    return cur.fetchall()
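
A hedged usage sketch of the two helpers together; the query and parameter below are placeholders, since the SQL constants named in the module docstring are not shown in this snippet:

# Illustrative only: the real module would pass e.g. PHOG_DATA_SQL.
cur = connect_to_server()
rows = sql_results(cur, 'SELECT family_id FROM tree WHERE id = %s', (1,))
for row in rows:
    print row['family_id']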
Example #6
import os
import re
import tempfile
from optparse import OptionParser
from random import randint

import pg
import pgdb
from Bio import SeqIO

from pfacts003.utils.credentials import get_credentials

# sql_escape and match_species are helper functions defined elsewhere
# in the original module; they are not shown in this snippet.


def main():
    # parse command line options
    usage = "%prog [options] fasta_file_to_cluster"
    opt_parser = OptionParser(usage=usage)
    opt_parser.add_option(
        "-s",
        "--candidate_seed_species",
        dest="species_str",
        default="",
        help="Comma-separated mnemonics of species which may be seeds")
    (options, args) = opt_parser.parse_args()
    if len(args) != 1:
        opt_parser.error('Incorrect number of arguments')
    fasta_file = args[0]
    if (not os.path.exists(fasta_file)):
        opt_parser.error('fasta file %s not found' % args[0])
    species_list = [
        species for species in options.species_str.upper().split(',')
        if species != ''
    ]
    if len(species_list) > 0:
        species_re = re.compile('|'.join(species_list))
        is_desired_species = (lambda record: match_species(species_re, record))
    else:
        is_desired_species = (lambda record: True)
    flock_password = get_credentials('flock_user')
    connection = pgdb.connect("db:flock_seeds:flock_user:%s" % flock_password)
    #  connection = pgdb.connect("db:flock_seeds::")
    cursor = connection.cursor()
    need_random_name = True
    while need_random_name:
        # Draw a fresh candidate name on every attempt, in case the
        # previous one collided with an existing table.
        handle, random_path = tempfile.mkstemp()
        random_name = os.path.split(random_path)[1]
        os.close(handle)
        os.unlink(random_path)
        sql = """CREATE TABLE %s (
              sequence_key VARCHAR(255) PRIMARY KEY,
              length INTEGER,
              is_seed BOOLEAN,
              has_been_clustered BOOLEAN,
              random_integer INTEGER
              )""" % random_name
        try:
            cursor.execute(sql)
            connection.commit()
            need_random_name = False
        except pg.DatabaseError:
            # The failed CREATE TABLE aborts the transaction; roll it
            # back before retrying with a new candidate name.
            connection.rollback()
    print random_name
    sql = """CREATE INDEX %s_is_seed
            ON %s(is_seed)""" % (random_name, random_name)
    cursor.execute(sql)
    connection.commit()
    sql = """CREATE INDEX %s_has_been_clustered
            ON %s(is_seed)""" % (random_name, random_name)
    cursor.execute(sql)
    connection.commit()
    sql = """CREATE INDEX %s_random_integer
            ON %s(random_integer)""" % (random_name, random_name)
    cursor.execute(sql)
    connection.commit()
    num_inserted_records = 0
    num_records = 0
    f = open(fasta_file, "rU")
    seq_iterator = SeqIO.parse(f, "fasta")
    for record in seq_iterator:
        num_records += 1
        if is_desired_species(record):
            sql = "SELECT sequence_key FROM %s WHERE sequence_key = '%s'" \
                    % (random_name, sql_escape(record.id))
            cursor.execute(sql)
            connection.commit()
            if cursor.rowcount > 0:
                # There is a duplicate entry, which we do not expect, so report it
                print "Duplicate entry for sequence key %s" % sql_escape(
                    record.id)
            else:
                sql = """INSERT INTO %s
                  ( sequence_key,
                    length,
                    is_seed,
                    has_been_clustered,
                    random_integer
                  )
                  VALUES
                  ( '%s',
                    %d,
                    false,
                    false,
                    %d
                  )""" % (random_name, sql_escape(record.id), len(
                    record.seq), randint(1, 10000000))
                cursor.execute(sql)
                connection.commit()
                num_inserted_records += 1
    connection.commit()
    cursor.close()
    f.close()
    connection.close()
    print "Inserted %d of %d records" % (num_inserted_records, num_records)
Example #7
from optparse import OptionParser
import subprocess
import sys

try:
    from pfacts003.utils.credentials import get_credentials
except ImportError:
    print """
    
    I couldn't import credentials. Are you sure you set up the
        environment? Your choices are production, staging and development.
    """
    sys.exit(1)

USER = '******'
password = get_credentials(USER)

if not password:
    print "Could not get password."
    sys.exit(1)


def create_submission(working_dir, dirname='schema_spy_output'):

    contents = """#!/bin/bash
#PBS -e %(working_dir)s/schema_spy_error.log
#PBS -o %(working_dir)s/schema_spy_output.log
#PBS -N schema_spy

# WARNING!
# The bpg password had been retrieved and is included below. This
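
The snippet is truncated at this point. Since subprocess is imported above, presumably the generated script is written to disk and handed to the batch scheduler; a hedged sketch of that step (the file name and qsub call are our assumptions):

import os

def submit(working_dir):
    # Hypothetical submission step: write the PBS script and qsub it.
    script_path = os.path.join(working_dir, 'schema_spy.sh')
    with open(script_path, 'w') as f:
        f.write(create_submission(working_dir))
    subprocess.call(['qsub', script_path])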
Example #8
import numpy
import psycopg2
import psycopg2.extras
from mpi4py import MPI

from pfacts003.utils.credentials import get_credentials

# TAG_DATABASE_ROW and TAG_DATABASE_DONE are module-level MPI message
# tags defined elsewhere in the original file.


def read_from_database(comm, num_tree_servers, num_uniprot_processors,
                       all_uniprot_ids):
  base_tree_server_id = 1
  base_uniprot_processor_id = 1 + num_tree_servers
  tree_row_info = numpy.zeros(6, dtype='d')
  uniprot_row_info = numpy.zeros(3, dtype='i')

  bpg_password = get_credentials('bpg_user')
  connection = psycopg2.connect(
    "dbname='%s' user='******' host='db' password='******'" %
    ('pfacts003_test', 'bpg_user', bpg_password))
  cur = connection.cursor('ortholog_cursor',
                          cursor_factory = psycopg2.extras.DictCursor)

  db_row_fetch_time = 0.0
  tree_row_send_time = 0.0
  uniprot_row_send_time = 0.0
  row_prep_time = 0.0
  t1 = MPI.Wtime()
  sql = """SELECT tree_id,
                  tree_node_left_id,
                  tree_node_right_id,
                  duplication_distance,
                  greatest_duplication_distance_of_maximal_descendant,
                  uniprot_id
                  FROM tree_node_uniprot_taxonomy_materialized
                  """

  cur.execute(sql)
  num_database_rows = 0
  db_row_start_t = MPI.Wtime()
  for row in cur:
    db_row_end_t = MPI.Wtime()
    db_row_fetch_time += db_row_end_t - db_row_start_t
    cur_tree_row_send_time = 0.0
    cur_uniprot_row_send_time = 0.0
    num_database_rows += 1
    if num_database_rows > 0 and num_database_rows % 1000000 == 0:
      print "Read %d rows from database so far" % num_database_rows
    tree_id = row[0]
    greatest_duplication_distance_of_maximal_descendant = row[4]
    uniprot_id = row[5]
    if (greatest_duplication_distance_of_maximal_descendant and
        (not row[3] or row[3] > row[4]) or
        uniprot_id is not None and uniprot_id in all_uniprot_ids):
      for i in range(6):
        if row[i] is not None:
          tree_row_info[i] = float(row[i])
        else:
          # NaN marks a missing value; assigning None into a numpy
          # float array would raise a TypeError.
          tree_row_info[i] = numpy.nan
      if uniprot_id is not None and uniprot_id not in all_uniprot_ids:
        tree_row_info[5] = numpy.nan
      tree_server_num = tree_id % num_tree_servers
      tree_row_send_start_t = MPI.Wtime()
      comm.Send([tree_row_info,MPI.DOUBLE_PRECISION], 
                dest=tree_server_num + base_tree_server_id,
                tag=TAG_DATABASE_ROW)
      tree_row_send_end_t = MPI.Wtime()
      cur_tree_row_send_time = tree_row_send_end_t - tree_row_send_start_t
      tree_row_send_time += cur_tree_row_send_time
      if not greatest_duplication_distance_of_maximal_descendant:
        uniprot_row_info[0] = int(row[0]) # tree_id
        uniprot_row_info[1] = int(row[1]) # tree_node_left_id
        uniprot_row_info[2] = int(row[5]) # uniprot_id
        uniprot_processor_num = uniprot_id % num_uniprot_processors
        uniprot_row_send_start_t = MPI.Wtime()
        comm.Send([uniprot_row_info,MPI.INT],
                  dest=uniprot_processor_num + base_uniprot_processor_id,
                  tag=TAG_DATABASE_ROW)
        uniprot_row_send_end_t = MPI.Wtime()
        cur_uniprot_row_send_time = uniprot_row_send_end_t \
                                    - uniprot_row_send_start_t
        uniprot_row_send_time += cur_uniprot_row_send_time
    db_row_start_t = MPI.Wtime()
    row_prep_time += (db_row_start_t - db_row_end_t) \
                  - cur_tree_row_send_time - cur_uniprot_row_send_time

  t2 = MPI.Wtime()
  print "Finished reading ", num_database_rows, 
  print " rows of the database in ", t2 - t1, " secs"
  print "Total time fetching database rows: ", db_row_fetch_time
  print "Total time sending tree rows: ", tree_row_send_time
  print "Total time sending uniprot rows: ", uniprot_row_send_time
  print "Total time preparing rows to send out: ", row_prep_time
  for tree_server_num in range(num_tree_servers):
    comm.Send([MPI.BOTTOM,MPI.INT], dest=tree_server_num + base_tree_server_id, 
              tag=TAG_DATABASE_DONE)

  for uniprot_processor_num in range(num_uniprot_processors):
    comm.Send([MPI.BOTTOM,MPI.INT], 
              dest=uniprot_processor_num + base_uniprot_processor_id,
              tag=TAG_DATABASE_DONE)
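
The matching receive side is not part of this snippet. A hedged sketch of a tree server's loop, assuming the database reader runs on rank 0 and that the tags and six-double row format match the sender above; none of this is from the original file:

def tree_server_loop(comm):
  # Hypothetical receive loop for one tree server rank. The sender
  # posts TAG_DATABASE_ROW messages of six doubles, then a final
  # TAG_DATABASE_DONE message.
  row = numpy.zeros(6, dtype='d')
  status = MPI.Status()
  while True:
    comm.Recv([row, MPI.DOUBLE_PRECISION], source=0,
              tag=MPI.ANY_TAG, status=status)
    if status.Get_tag() == TAG_DATABASE_DONE:
      break
    handle_tree_row(row)  # hypothetical handler, defined elsewhere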