예제 #1
0
def load_sequences_from_uniprot(proteins, clean_seqid=None, cache_basename=None):
  """Look up UniProt metadata for the seqids in `proteins` and load it in.

  Args:
    proteins: container iterated by seqid; each entry is indexed with
      ['attr'], which may hold an 'other_seqids' iterable of extra ids
      that are added to the lookup.
    clean_seqid: optional; when truthy it is handed to
      change_seqids_in_proteins together with `proteins` before the lookup.
    cache_basename: optional; forwarded to the UniProt lookup and, when
      truthy, also used as the stem of the FASTA file that is written
      (cache_basename + '.fasta').

  Returns:
    None. `proteins` is passed to load_fastas_into_proteins with the
    fetched metadata.
  """
  if clean_seqid:
    change_seqids_in_proteins(proteins, clean_seqid)

  # Collect every primary seqid followed by its alternative ids, in order.
  lookup_ids = []
  for primary_id in proteins:
    lookup_ids.append(primary_id)
    attr = proteins[primary_id]['attr']
    if 'other_seqids' in attr:
      lookup_ids.extend(attr['other_seqids'])

  metadata = uniprot.get_metadata_with_some_seqid_conversions(
      lookup_ids, cache_basename)
  load_fastas_into_proteins(proteins, metadata)

  # Without a cache basename there is no FASTA file to write.
  if not cache_basename:
    return
  fasta_path = cache_basename + '.fasta'
  uniprot.write_fasta(fasta_path, metadata, metadata.keys())
예제 #2
0
# Read the target table and combine its 'Entry' column with `entries`.
# NOTE(review): `entries` is read here before this snippet assigns it below,
# so it must already be defined by earlier code - confirm.
filename = '/data/cycd_targets/cycd_target_uniprot.txt'
targetIDs = pd.read_csv(filename)
already_seen = pd.concat((targetIDs['Entry'], entries))

# Load hit list from PSSM
# The file has a .csv extension but is parsed as tab-separated (sep='\t').
# `filename` and `targetIDs` are rebound here; the first table is only kept
# through `already_seen`.
filename = '/data/cycd_targets/hsap_proteome/hsap_hits>20.csv'
targetIDs = pd.read_csv(filename, sep='\t')
entries = targetIDs['Entry']

# Do a merge to see what's not already seen in the hand-curated list
# (implemented as a set difference: hit entries minus `already_seen`).
merged = set(entries) - set(already_seen)

# Fetch and write as FASTA
# `merged` is a set, so the order in which ids are handed over is not fixed.
out_name = '/data/cycd_targets/hsap_hits>20.fasta'
upData = uniprot.batch_uniprot_metadata(merged, 'cache')
uniprot.write_fasta(out_name, upData, merged)

# Post-process the FASTA file just written; split_fastas is defined
# elsewhere (presumably one file per sequence - confirm).
split_fastas(out_name)

# Directory that the loop below scans for '.ss2' files.
PSIPRED_DIR = '/data/cycd_targets/cycd_target_uniprot_individuals'
# Accumulator; how it is filled is not visible in this snippet.
seqs = []

for filename in os.listdir(PSIPRED_DIR):
    if filename.endswith('.ss2'):
        print 'Working on ', filename

        #Load PSIPRED VFORMAT in a sane way to extract only relevant info
        df = pd.read_csv(os.path.join(PSIPRED_DIR, filename),
                         header=0,
                         delim_whitespace=True,
                         skiprows=0,
예제 #3
0
# Convert the input seqids to UniProt ids. "cache.json" is passed as the
# second argument (presumably an on-disk cache file - confirm).
mapping = uniprot.sequentially_convert_to_uniprot_id(seqids, "cache.json")

uniprot_seqids = mapping.values()


# Example 4 - get UniProt metadata

uniprot_data = uniprot.batch_uniprot_metadata(uniprot_seqids, "cache2.txt")

pprint.pprint(uniprot_data, indent=2)

# Echo "cache2.txt" line by line. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open("cache2.txt") as cache_file:
    for l in cache_file:
        # Parenthesised print of a single value behaves the same on
        # Python 2 and Python 3.
        print(l.strip())

# Write the fetched records for the converted ids out as FASTA.
uniprot.write_fasta("example.output.fasta", uniprot_data, uniprot_seqids)


# Example 5 - chaining commands to make your own
# special mapper


def map_to_refseq(seqids):
    uniprot_mapping = uniprot.sequentially_convert_to_uniprot_id(seqids, "func.cache.json")
    uniprot_ids = uniprot_mapping.values()
    pairs = uniprot.batch_uniprot_id_mapping_pairs("ACC", "P_REFSEQ_AC", uniprot_ids)
    mapping = {}
    for seqid in seqids:
        if seqid in uniprot_mapping:
            uniprot_id = uniprot_mapping[seqid]
        for pair in pairs:
예제 #4
0
    seqids, 'cache.json')

# `mapping` is assigned by the statement that ends just above this block;
# its values are the ids used for the metadata lookup below.
uniprot_seqids = mapping.values()


# Example 4 - get UniProt metadata

uniprot_data = uniprot.batch_uniprot_metadata(
    uniprot_seqids, 'cache2.txt')

pprint.pprint(uniprot_data, indent=2)

# Echo 'cache2.txt' line by line. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open('cache2.txt') as cache_file:
  for l in cache_file:
    # Parenthesised print of a single value behaves the same on
    # Python 2 and Python 3.
    print(l.strip())

# Write the fetched records for the converted ids out as FASTA.
uniprot.write_fasta('example.output.fasta', uniprot_data, uniprot_seqids)


# Example 5 - chaining commands to make your own
# special mapper


def map_to_refseq(seqids):
  uniprot_mapping = uniprot.sequentially_convert_to_uniprot_id(
      seqids, 'func.cache.json')
  uniprot_ids = uniprot_mapping.values()
  pairs = uniprot.batch_uniprot_id_mapping_pairs(
    'ACC', 'P_REFSEQ_AC', uniprot_ids)
  mapping = {}  
  for seqid in seqids:
    if seqid in uniprot_mapping: