예제 #1
0
class Entrez:
    """Fetch NCBI Entrez gene files and build a human-only gene table.

    The online version is the date shown next to ``gene2accession.gz`` in the
    NCBI directory listing. ``update()`` downloads ``gene2accession``,
    ``gene_info`` and ``interactions``, keeps the header line plus the rows
    whose first column is ``9606``, and writes ``entrez_genes.tsv``.
    """

    def __init__(self, download_path):
        """Look up the online version and set up version tracking.

        Args:
            download_path: Directory that downloaded, extracted and generated
                files are written to.
        """
        self.online_version = None
        self.get_online_version()
        self.version = Version('Entrez',
                               version=self.online_version,
                               download_path=download_path)
        self.logged_version = self.version.last_logged_version()
        self.download_path = download_path

    def is_current(self):
        """Return the result of the version tracker's ``is_current()``."""
        return self.version.is_current()

    def get_online_version(self):
        """Set ``online_version`` from the NCBI gene/DATA directory listing.

        The date is reformatted from ``%d-%b-%Y`` to ``%d-%B-%Y``. If the
        listing has no ``gene2accession.gz`` link, ``online_version`` is left
        unchanged.
        """
        # This assumes that if gene2accession needs updating, so will other Entrez files.
        r = requests.get('http://ftp.ncbi.nlm.nih.gov/gene/DATA/',
                         stream=True,
                         allow_redirects=True)
        bsObj = BeautifulSoup(r.text, "html.parser")
        for link in bsObj.find_all('a'):
            if link.get('href') == 'gene2accession.gz':
                # The node after the link text is expected to start with the
                # file's modification date.
                listed_date = link.next.next.split()[0]
                self.online_version = datetime.datetime.strptime(
                    listed_date, '%d-%b-%Y').strftime('%d-%B-%Y')
                break

    def download_file(self, url, local_filename):
        """Stream ``url`` into ``local_filename`` in 1 KiB chunks.

        Args:
            url: Address to fetch (redirects are followed).
            local_filename: Path of the file to create or overwrite.

        Raises:
            requests.HTTPError: If the server answers with an error status,
                so an error page is never saved in place of the data.
        """
        # NOTE the stream=True parameter
        r = requests.get(url, stream=True, allow_redirects=True)
        try:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        finally:
            r.close()

    def extract(self, file):
        """Copy the header and the human rows of a gzipped table.

        The output is ``<download_path>/<name>.human`` where ``<name>`` is the
        base name of ``file`` without its last extension.

        Args:
            file: Path of the gzip-compressed, UTF-8 encoded table.
        """
        # ``file`` usually already contains ``download_path``; joining the
        # whole path again doubled a relative directory ('data/data/...').
        stem = os.path.basename(file).rsplit('.', 1)[0]
        target = os.path.join(self.download_path, stem + '.human')
        with gzip.open(file, 'rb') as rf, open(target, 'w') as wf:
            for i, line in enumerate(rf):
                line_ascii = line.decode('utf-8')
                if i == 0:
                    wf.write(line_ascii)
                    continue
                # Blank lines have no first field and are skipped.
                fields = line_ascii.split(None, 1)
                if fields and fields[0] == '9606':  # Grab human only
                    wf.write(line_ascii)

    def download_files(self):
        """Download, extract and then delete the three Entrez archives."""
        g2a_filename = os.path.join(self.download_path, 'gene2accession.gz')
        gi_filename = os.path.join(self.download_path, 'gene_info.gz')
        ia_filename = os.path.join(self.download_path, 'interactions.gz')

        print('Downloading Entrez Accessions...')
        print('gene2accession:')
        self.download_file(
            "http://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz",
            g2a_filename)
        print('\ngene_info:')
        self.download_file(
            "http://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz", gi_filename)
        print('\ninteractions:')
        self.download_file(
            "http://ftp.ncbi.nlm.nih.gov/gene/GeneRIF/interactions.gz",
            ia_filename)

        print('\nExtracting Entrez Accessions...')
        print('gene_info...')
        self.extract(gi_filename)
        print('gene2accession...')
        self.extract(g2a_filename)
        print('interactions...')
        self.extract(ia_filename)

        for archive in (g2a_filename, gi_filename, ia_filename):
            os.remove(archive)

    def parse(self):
        """Load ``gene_info.human`` into ``self.rows``.

        Each row is a dict keyed by the column names below. ``'-'`` values
        become ``'N/A'``, and ``ensembl_ids`` holds the ``Ensembl:`` entries
        of ``dbXrefs`` joined with ``'|'`` in sorted order (``'N/A'`` when
        there are none).
        """
        fieldnames = [
            'tax_id', 'entrez_id', 'entrez_gene_symbol', 'locus_tag',
            'entrez_gene_synonyms', 'dbXrefs', 'chromosome', 'map_loc',
            'description', 'type', 'sym_from_auth', 'full_from_auth',
            'nom_status', 'other_designations', 'mod_date'
        ]
        self.rows = []
        path = os.path.join(self.download_path, 'gene_info.human')
        with open(path, newline='') as f:
            reader = csv.DictReader(f, delimiter='\t', fieldnames=fieldnames)
            for i, row in enumerate(reader):
                if i == 0:
                    # The first line is the file's own header.
                    continue
                for key, value in row.items():
                    if value == '-':
                        row[key] = 'N/A'
                ensembl = set()
                for xRef in row['dbXrefs'].split('|'):
                    if xRef == 'N/A':
                        continue
                    # partition() tolerates an entry without a colon.
                    source, _, label = xRef.partition(':')
                    if source == 'Ensembl':
                        ensembl.add(label)
                # Sorted so the output does not depend on set ordering.
                row['ensembl_ids'] = '|'.join(sorted(ensembl)) or 'N/A'
                self.rows.append(row)

    def write(self):
        """Write ``entrez_genes.tsv`` from ``self.rows`` and log the version."""
        fieldnames = [
            'entrez_id', 'entrez_gene_symbol', 'entrez_gene_synonyms',
            'ensembl_ids', 'description'
        ]
        filename = os.path.join(self.download_path, 'entrez_genes.tsv')
        # newline='' lets the csv module control line endings itself.
        with open(filename, 'w', newline='') as f:
            writer = csv.DictWriter(f,
                                    delimiter='\t',
                                    fieldnames=fieldnames,
                                    extrasaction='ignore')
            writer.writeheader()
            writer.writerows(self.rows)
        self.version.write_log()

    def update(self):
        """Download, parse and write the data unless it is already current."""
        if not self.is_current():
            self.download_files()
            self.parse()
            self.write()
예제 #2
0
class DrugBank(object):
    """Download DrugBank, map its human targets to Entrez genes, write a TSV.

    ``parse()`` fills ``self.drug_info`` and ``self.interactions`` from the
    DrugBank XML and the Entrez ``*.human`` files found in ``download_path``;
    ``write()`` flattens them into ``tsv_file``.
    """

    _VERSION_RE = re.compile(r'Version ([\d\.]+)')
    # regex from: http://www.uniprot.org/help/accession_numbers
    _UNIPROT_RE = re.compile(
        r'[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}'
    )
    _BARE_NUMBER_RE = re.compile(r'^\d+$')
    _NS = {'entry': 'http://www.drugbank.ca'}

    def __init__(self, username, password, download_path, tsv_file):
        """Look up the online version and store credentials and paths.

        Args:
            username: Account name sent as HTTP basic auth when downloading.
            password: Password sent as HTTP basic auth when downloading.
            download_path: Directory holding downloaded and extracted files.
            tsv_file: Path of the TSV file produced by ``write()``.
        """
        self.online_version = None
        self.get_online_version()
        self.version = Version('DrugBank',
                               version=self.online_version,
                               download_path=download_path)
        self.logged_version = self.version.last_logged_version()
        self.interactions = self.drug_info = None
        self.username = username
        self.password = password
        self.download_path = download_path
        self.tsv_file = tsv_file

    def is_current(self):
        """Return the result of the version tracker's ``is_current()``."""
        return self.version.is_current()

    def get_online_version(self):
        """Set ``online_version`` from the first heading of the downloads page.

        Raises:
            ValueError: If the page has no ``<h1>`` or the heading holds no
                ``Version x.y.z`` text.
        """
        print('Checking DrugBank Version...')
        html = requests.get('http://www.drugbank.ca/downloads')
        bsObj = BeautifulSoup(html.text, "html.parser")
        heading = bsObj.h1
        match = None
        if heading is not None:
            match = self._VERSION_RE.search(heading.text)
        if not match:
            raise ValueError('Error loading online version.')
        self.online_version = match.group(1)

    def download_file(self, url, local_filename):
        """Stream ``url`` into ``local_filename`` using HTTP basic auth.

        Raises:
            requests.HTTPError: If the server answers with an error status,
                so an error page is never saved in place of the archive.
        """
        # NOTE the stream=True parameter
        r = requests.get(url,
                         stream=True,
                         allow_redirects=True,
                         auth=HTTPBasicAuth(self.username, self.password))
        try:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        finally:
            r.close()

    def download_files(self):
        """Download and unpack the DrugBank XML, then refresh Entrez data."""
        print('Downloading DrugBank XML...')
        filename = os.path.join(self.download_path, 'drugbank.zip')
        # NOTE(review): the release in this URL is fixed at 5-1-7 and is not
        # derived from ``online_version``.
        self.download_file(
            'https://www.drugbank.ca/releases/5-1-7/downloads/all-full-database',
            filename)

        print('\nExtracting DrugBank XML...')
        with zipfile.ZipFile(filename) as zfile:
            zfile.extract('full database.xml', self.download_path)
        os.remove(filename)
        e = Entrez(self.download_path)
        e.update()

    def _load_gene_info(self):
        """Read ``gene_info.human`` into symbol-, HGNC- and Entrez-keyed maps."""
        symbol_to_info = dict()
        hgnc_id_to_info = dict()
        entrez_to_info = dict()
        with open(os.path.join(self.download_path, 'gene_info.human')) as f:
            for i, line in enumerate(csv.reader(f, delimiter='\t')):
                if i == 0 or line[0] != '9606':
                    continue
                entrez_id = line[1]
                gene_symbol = line[2]
                gene = {'Entrez': entrez_id, 'Symbol': gene_symbol}
                symbol_to_info[gene_symbol] = gene
                if line[5] == '-':
                    # Genes without cross references are only reachable by
                    # symbol, not by HGNC or Entrez id.
                    continue
                for xref in line[5].split('|'):
                    (source, accession) = xref.split(':', 1)
                    gene[source] = accession
                if 'HGNC' in gene:
                    hgnc_id_to_info[gene['HGNC']] = gene
                entrez_to_info[entrez_id] = gene
        return symbol_to_info, hgnc_id_to_info, entrez_to_info

    def _load_uniprot_to_entrez(self):
        """Read ``gene2accession.human`` into a UniProt-to-Entrez id map."""
        uniprot_to_entrez = dict()
        path = os.path.join(self.download_path, 'gene2accession.human')
        with open(path) as f:
            for i, line in enumerate(csv.reader(f, delimiter='\t')):
                if i == 0 or line[0] != '9606':
                    continue
                # Drop the version suffix after the first dot.
                uniprot_id = line[5].split('.', 1)[0]
                if not self._UNIPROT_RE.match(uniprot_id):
                    continue
                uniprot_to_entrez[uniprot_id] = line[1]
        return uniprot_to_entrez

    def parse(self):
        """Build ``self.drug_info`` and ``self.interactions`` from the XML.

        ``drug_info`` maps a DrugBank id to ``(name, synonyms, cas_number,
        brands, type, groups, categories, chembl_id)``. It only contains
        drugs that have at least one target.

        ``interactions`` maps a DrugBank id to a list with one tuple per
        target whose organism is ``'Humans'``: ``(gene_id, known_action,
        target_actions, gene_symbol, uniprot_id, entrez_id, ensembl_id,
        pmids)``.

        A target's gene is resolved by HGNC id, then by UniProt id, then by
        gene symbol. Values that cannot be resolved stay ``None``.
        """
        print('Parsing Entrez...')
        symbol_to_info, hgnc_id_to_info, entrez_to_info = self._load_gene_info()
        uniprot_to_entrez = self._load_uniprot_to_entrez()

        print('Parsing DrugBank XML...')
        ns = self._NS

        tree = ET.parse(os.path.join(self.download_path, 'full database.xml'))
        drugbank = tree.getroot()
        drugs = drugbank.findall('entry:drug', ns)

        interactions = dict()
        drug_info = dict()
        for drug in drugs:
            drug_id = drug.find('entry:drugbank-id', ns).text
            drug_name = drug.find('entry:name', ns).text

            drug_synonyms = set()
            for synonym in drug.find('entry:synonyms', ns):
                language = synonym.get('language')
                if language == '' or language == 'English':
                    drug_synonyms.add(synonym.text)
            chembl_id = ''
            for external_identifier in drug.find('entry:external-identifiers',
                                                 ns):
                resource = external_identifier.find('entry:resource', ns).text
                if resource == 'ChEMBL':
                    chembl_id = external_identifier.find(
                        'entry:identifier', ns).text
            drug_cas_number = drug.find('entry:cas-number', ns).text
            drug_brands = set()
            for product in drug.find('entry:products', ns):
                drug_brands.add(product.find('entry:name', ns).text)
            for int_brand in drug.find('entry:international-brands', ns):
                drug_brands.add(int_brand.find('entry:name', ns).text)
            drug_type = drug.get('type')
            drug_groups = set()
            for group in drug.find('entry:groups', ns):
                drug_groups.add(group.text)
            drug_categories = set()
            for category in drug.find('entry:categories', ns):
                drug_categories.add(
                    category.find('entry:category', ns).text.lower())
            targets = drug.find('entry:targets', ns)
            if len(targets) == 0:
                continue
            drug_info[drug_id] = (drug_name, tuple(sorted(drug_synonyms)),
                                  drug_cas_number, tuple(sorted(drug_brands)),
                                  drug_type, tuple(sorted(drug_groups)),
                                  tuple(sorted(drug_categories)), chembl_id)
            for target in targets:
                organism = target.find('entry:organism', ns).text
                if organism != 'Humans':
                    continue
                gene_id = target.find('entry:id', ns).text
                known_action = target.find('entry:known-action', ns).text
                target_actions = set()
                for action in target.find('entry:actions', ns):
                    target_actions.add(action.text)
                gene_symbol = uniprot_id = entrez_id = ensembl_id = None
                pmids = set()
                references = target.find('entry:references', ns)
                articles = references.find('entry:articles',
                                           ns).findall('entry:article', ns)
                for article in articles:
                    pmids.add(article.find('entry:pubmed-id', ns).text)
                pmids = tuple(pmids)
                polypeptide = target.find('entry:polypeptide', ns)
                # ``gene`` is the Entrez record matched so far, if any.
                gene = None
                if polypeptide is not None:
                    gene_symbol = polypeptide.find('entry:gene-name', ns).text
                    for identifier in polypeptide.find(
                            'entry:external-identifiers', ns):
                        resource = identifier.find('entry:resource', ns).text
                        if resource == 'HUGO Gene Nomenclature Committee (HGNC)':
                            hgnc_gene_acc = identifier.find(
                                'entry:identifier', ns).text
                            # Some identifiers are incorrectly labeled by DrugBank
                            if hgnc_gene_acc.startswith('GNC:'):
                                hgnc_gene_acc = 'H' + hgnc_gene_acc
                            elif self._BARE_NUMBER_RE.match(hgnc_gene_acc):
                                hgnc_gene_acc = 'HGNC:' + hgnc_gene_acc
                            if hgnc_gene_acc in hgnc_id_to_info:
                                gene = hgnc_id_to_info[hgnc_gene_acc]
                                entrez_id = gene['Entrez']
                                if 'Ensembl' in gene:
                                    ensembl_id = gene['Ensembl']
                        elif resource == 'UniProtKB':
                            uniprot_id = identifier.find(
                                'entry:identifier', ns).text
                            if not gene:
                                try:
                                    # entrez_id is kept even when the second
                                    # lookup fails.
                                    entrez_id = uniprot_to_entrez[uniprot_id]
                                    gene = entrez_to_info[entrez_id]
                                except KeyError:
                                    pass
                    if not gene:
                        # Last resort: match on the gene symbol.
                        gene = symbol_to_info.get(gene_symbol)
                        if gene is not None:
                            entrez_id = gene['Entrez']
                            # Genes without an Ensembl xref used to raise an
                            # uncaught KeyError here.
                            ensembl_id = gene.get('Ensembl')
                interaction_tuple = (gene_id, known_action,
                                     tuple(sorted(target_actions)),
                                     gene_symbol, uniprot_id, entrez_id,
                                     ensembl_id, pmids)
                interactions.setdefault(drug_id, []).append(interaction_tuple)
        self.interactions = interactions
        self.drug_info = drug_info

    def write(self):
        """Write one TSV row per drug-target interaction to ``tsv_file``.

        Tuple values are joined with ``';'``, tabs inside values are removed,
        and empty or ``None`` values are written as ``'N/A'``. The version is
        then written to ``tmp/version``.
        """
        print('Writing to .tsv...')
        header = ('count', 'drug_id', 'drug_name', 'drug_synonyms',
                  'drug_cas_number', 'drug_brands', 'drug_type', 'drug_groups',
                  'drug_categories', 'chembl_id', 'gene_id', 'known_action',
                  'target_actions', 'gene_symbol', 'uniprot_id', 'entrez_id',
                  'ensembl_id', 'pmid')
        count = 0
        # newline='' lets the csv module control line endings itself.
        with open(self.tsv_file, 'w', newline='') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerow(header)
            for drug in sorted(self.interactions):
                for interaction in self.interactions[drug]:
                    count += 1
                    data = (count, drug) + self.drug_info[drug] + interaction
                    out = list()
                    for datum in data:
                        if isinstance(datum, tuple):
                            datum = ';'.join(str(x) for x in datum)
                        # Some small number of rows contain tabs within text.
                        datum = str(datum).replace("\t", '')
                        if not datum or datum == 'None':
                            datum = 'N/A'
                        out.append(datum)
                    writer.writerow(out)
        with open('tmp/version', 'w') as version_file:
            # NOTE(review): ``self.version`` is a Version object here; str()
            # avoids a TypeError in write(). Confirm the text this file
            # should contain.
            version_file.write(str(self.version))

    def update(self):
        """Download, parse and write the data unless it is already current."""
        if not self.is_current():
            self.download_files()
            self.parse()
            self.write()
예제 #3
0
파일: get_go.py 프로젝트: acoffman/dgi-db
class GO:
    """Build a gene-category table from per-term QuickGO annotation files.

    Each DGIdb category corresponds to one GO term. ``parse()`` reads
    ``data/GO/<go_id>.tsv`` for every term and ``write()`` produces
    ``data/go.human.tsv``.
    """

    _TERM_FIELDS = ('short_name', 'full_go_name', 'human_readable', 'go_id')

    # These were assembled from the original DGIdb identifiers
    _TERMS = (
        ('Kinase', 'KinaseActivity', 'Kinase', 'GO0016301'),
        ('TyrosineKinase', 'ProteinTyrosineKinaseActivity',
         'Tyrosine Kinase', 'GO0004713'),
        ('SerineThreonineKinase', 'ProteinSerineThreonineKinaseActivity',
         'Serine Threonine Kinase', 'GO0004674'),
        ('ProteinPhosphatase', 'PhospoproteinPhosphataseActivity',
         'Protein Phosphatase', 'GO0004721'),
        ('GProteinCoupledReceptor', 'GpcrActivity',
         'G Protein Coupled Receptor', 'GO0004930'),
        ('NeutralZincMetallopeptidases', 'MetallopeptidaseActivity',
         'Neutral Zinc Metallopeptidase', 'GO0008237'),
        ('ABCTransporter', 'ABCTransporterActivity',
         'ABC Transporter', 'GO0042626'),
        ('RNADirectedDNAPolymerase', 'RNADirectedDnaPolymeraseActivity',
         'RNA Directed DNA Polymerase', 'GO0003964'),
        ('Transporter', 'TransporterActivity', 'Transporter', 'GO0005215'),
        ('IonChannel', 'IonChannelActivity', 'Ion Channel', 'GO0005216'),
        ('NuclearHormoneReceptor', 'LigandDependentNuclearReceptorActivity',
         'Nuclear Hormone Receptor', 'GO0004879'),
        ('LipidKinase', 'LipidKinaseActivity', 'Lipid Kinase', 'GO0001727'),
        ('Phospholipase', 'PhospholipaseActivity',
         'Phospholipase', 'GO0004620'),
        ('ProteaseInhibitorActivity', 'PeptidaseInhibitorActivity',
         'Protease Inhibitor', 'GO0030414'),
        ('DNARepair', 'DnaRepair', 'DNA Repair', 'GO0006281'),
        ('CellSurface', 'CellSurface', 'Cell Surface', 'GO0009986'),
        ('ExternalSideOfPlasmaMembrane', 'ExternalSideOfPlasmaMembrane',
         'External Side Of Plasma Membrane', 'GO0009897'),
        ('GrowthFactor', 'GrowthFactorActivity',
         'Growth Factor', 'GO0008083'),
        ('HormoneActivity', 'HormoneActivity',
         'Hormone Activity', 'GO0005179'),
        ('TumorSuppressor', 'RegulationOfCellCycle',
         'Tumor Suppressor', 'GO0051726'),
        ('TranscriptionFactorBinding', 'TranscriptionFactorBinding',
         'Transcription Factor Binding', 'GO0008134'),
        ('TranscriptionFactorComplex', 'TranscriptionFactorComplex',
         'Transcription Factor Complex', 'GO0005667'),
        ('HistoneModification', 'HistoneModification',
         'Histone Modification', 'GO0016570'),
        ('DrugMetabolism', 'DrugMetabolism', 'Drug Metabolism', 'GO0017144'),
        ('DrugResistance', 'ResponseToDrug', 'Drug Resistance', 'GO0042493'),
        ('ProteaseActivity', 'PeptidaseActivity', 'Protease', 'GO0008233'),
    )

    def __init__(self):
        """Create the term list, the version tracker and an empty row list."""
        self.dgidb_go_terms = [
            dict(zip(self._TERM_FIELDS, term)) for term in self._TERMS
        ]

        #TODO: Use http://www.ebi.ac.uk/QuickGO/GHistory#info=2 for versioning.
        self.version = Version('GO', append_date=True)
        self.logged_version = self.version.last_logged_version()

        self.rows = []

    def is_current(self):
        """Return the result of the version tracker's ``is_current()``."""
        return self.version.is_current()

    def download_files(self):
        """Download one QuickGO annotation TSV per term into ``data/GO``.

        An existing file for a term is removed before it is downloaded again.
        """
        url = 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&limit=-1&gz=false&tax=9606&goid='
        os.makedirs('data/GO', exist_ok=True)
        for term in self.dgidb_go_terms:
            compact_id = term['go_id']
            # The service is queried with the colon form, e.g. 'GO:0016301'.
            go_id = ':'.join((compact_id[:2], compact_id[2:]))
            file = 'data/GO/' + go_id.replace(':', '') + '.tsv'
            try:
                os.remove(file)
            except FileNotFoundError:
                pass
            wget.download(url + go_id, out=file)

    def parse(self):
        """Fill ``self.rows`` with one entry per (term, gene symbol) pair.

        Each entry has ``Symbol``, ``Category`` (the term's upper-cased
        ``human_readable`` name) and ``IDs`` (the distinct ``ID`` values for
        that symbol, sorted and joined with ``'|'``). Rows whose symbol is
        ``'-'`` or contains a space are skipped.
        """
        self.rows = []
        for term in self.dgidb_go_terms:
            category = term['human_readable'].upper()
            file = 'data/GO/' + term['go_id'].replace(':', '') + '.tsv'
            ids_by_symbol = {}
            with open(file, 'r') as f:
                for row in csv.DictReader(f, delimiter='\t'):
                    symbol = row['Symbol']
                    if symbol == '-' or ' ' in symbol:
                        continue
                    ids_by_symbol.setdefault(symbol, set()).add(row['ID'])
            for symbol, ids in ids_by_symbol.items():
                self.rows.append({
                    'Symbol': symbol,
                    'Category': category,
                    # Sorted so the output does not depend on set ordering.
                    'IDs': '|'.join(sorted(ids)),
                })

    def write(self):
        """Write ``self.rows`` to ``data/go.human.tsv`` and log the version."""
        fieldnames = ['Symbol', 'IDs', 'Category']
        # newline='' lets the csv module control line endings itself.
        with open('data/go.human.tsv', 'w', newline='') as f:
            writer = csv.DictWriter(f, delimiter='\t', fieldnames=fieldnames,
                                    extrasaction='ignore')
            writer.writeheader()
            writer.writerows(self.rows)
        self.version.write_log()

    def update(self):
        """Parse and write the data unless it is already current.

        ``download_files()`` is not called here, so ``parse()`` reads
        whatever is already present under ``data/GO``.
        """
        if not self.is_current():
            #self.download_files()
            self.parse()
            self.write()
예제 #4
0
class GO:
    """Turn QuickGO annotation downloads into a DGIdb gene-category table.

    One GO term backs each DGIdb category. ``parse()`` loads
    ``data/GO/<go_id>.tsv`` per term; ``write()`` emits ``data/go.human.tsv``.
    """

    # Column order of every entry in _TERM_TABLE.
    _TERM_KEYS = ("short_name", "full_go_name", "human_readable", "go_id")

    # These were assembled from the original DGIdb identifiers
    _TERM_TABLE = (
        ("Kinase", "KinaseActivity", "Kinase", "GO0016301"),
        ("TyrosineKinase", "ProteinTyrosineKinaseActivity",
         "Tyrosine Kinase", "GO0004713"),
        ("SerineThreonineKinase", "ProteinSerineThreonineKinaseActivity",
         "Serine Threonine Kinase", "GO0004674"),
        ("ProteinPhosphatase", "PhospoproteinPhosphataseActivity",
         "Protein Phosphatase", "GO0004721"),
        ("GProteinCoupledReceptor", "GpcrActivity",
         "G Protein Coupled Receptor", "GO0004930"),
        ("NeutralZincMetallopeptidases", "MetallopeptidaseActivity",
         "Neutral Zinc Metallopeptidase", "GO0008237"),
        ("ABCTransporter", "ABCTransporterActivity",
         "ABC Transporter", "GO0042626"),
        ("RNADirectedDNAPolymerase", "RNADirectedDnaPolymeraseActivity",
         "RNA Directed DNA Polymerase", "GO0003964"),
        ("Transporter", "TransporterActivity", "Transporter", "GO0005215"),
        ("IonChannel", "IonChannelActivity", "Ion Channel", "GO0005216"),
        ("NuclearHormoneReceptor", "LigandDependentNuclearReceptorActivity",
         "Nuclear Hormone Receptor", "GO0004879"),
        ("LipidKinase", "LipidKinaseActivity", "Lipid Kinase", "GO0001727"),
        ("Phospholipase", "PhospholipaseActivity",
         "Phospholipase", "GO0004620"),
        ("ProteaseInhibitorActivity", "PeptidaseInhibitorActivity",
         "Protease Inhibitor", "GO0030414"),
        ("DNARepair", "DnaRepair", "DNA Repair", "GO0006281"),
        ("CellSurface", "CellSurface", "Cell Surface", "GO0009986"),
        ("ExternalSideOfPlasmaMembrane", "ExternalSideOfPlasmaMembrane",
         "External Side Of Plasma Membrane", "GO0009897"),
        ("GrowthFactor", "GrowthFactorActivity",
         "Growth Factor", "GO0008083"),
        ("HormoneActivity", "HormoneActivity",
         "Hormone Activity", "GO0005179"),
        ("TumorSuppressor", "RegulationOfCellCycle",
         "Tumor Suppressor", "GO0051726"),
        ("TranscriptionFactorBinding", "TranscriptionFactorBinding",
         "Transcription Factor Binding", "GO0008134"),
        ("TranscriptionFactorComplex", "TranscriptionFactorComplex",
         "Transcription Factor Complex", "GO0005667"),
        ("HistoneModification", "HistoneModification",
         "Histone Modification", "GO0016570"),
        ("DrugMetabolism", "DrugMetabolism", "Drug Metabolism", "GO0017144"),
        ("DrugResistance", "ResponseToDrug", "Drug Resistance", "GO0042493"),
        ("ProteaseActivity", "PeptidaseActivity", "Protease", "GO0008233"),
    )

    def __init__(self):
        """Set up the term dictionaries, version tracking and row storage."""
        self.dgidb_go_terms = [
            dict(zip(self._TERM_KEYS, entry)) for entry in self._TERM_TABLE
        ]

        #TODO: Use http://www.ebi.ac.uk/QuickGO/GHistory#info=2 for versioning.
        self.version = Version('GO', append_date=True)
        self.logged_version = self.version.last_logged_version()

        self.rows = []

    def is_current(self):
        """Return the result of the version tracker's ``is_current()``."""
        return self.version.is_current()

    def download_files(self):
        """Fetch the QuickGO annotation TSV of every term into ``data/GO``.

        A previously downloaded file for a term is deleted first.
        """
        url = 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&limit=-1&gz=false&tax=9606&goid='
        os.makedirs('data/GO', exist_ok=True)
        for term in self.dgidb_go_terms:
            # 'GO0016301' is requested from the service as 'GO:0016301'.
            go_id = ':'.join((term['go_id'][:2], term['go_id'][2:]))
            file = 'data/GO/' + go_id.replace(':', '') + '.tsv'
            try:
                os.remove(file)
            except FileNotFoundError:
                pass
            wget.download(url + go_id, out=file)

    def parse(self):
        """Rebuild ``self.rows`` from the per-term files under ``data/GO``.

        Every row carries ``Symbol``, ``Category`` (upper-cased
        ``human_readable`` name of the term) and ``IDs`` (distinct ``ID``
        values of the symbol, sorted and ``'|'``-joined). Symbols equal to
        ``'-'`` or containing a space are ignored.
        """
        self.rows = []
        for term in self.dgidb_go_terms:
            file = 'data/GO/' + term['go_id'].replace(':', '') + '.tsv'
            category = term['human_readable'].upper()
            collected = {}
            with open(file, 'r') as f:
                reader = csv.DictReader(f, delimiter='\t')
                for record in reader:
                    symbol = record['Symbol']
                    if symbol == '-' or ' ' in symbol:
                        continue
                    collected.setdefault(symbol, set()).add(record['ID'])
            for symbol in collected:
                self.rows.append({
                    'Symbol': symbol,
                    'Category': category,
                    # Sorted: joining a raw set gives a run-dependent order.
                    'IDs': '|'.join(sorted(collected[symbol])),
                })

    def write(self):
        """Write ``self.rows`` to ``data/go.human.tsv`` and log the version."""
        fieldnames = ['Symbol', 'IDs', 'Category']
        # newline='' lets the csv module control line endings itself.
        with open('data/go.human.tsv', 'w', newline='') as f:
            writer = csv.DictWriter(f,
                                    delimiter='\t',
                                    fieldnames=fieldnames,
                                    extrasaction='ignore')
            writer.writeheader()
            writer.writerows(self.rows)
        self.version.write_log()

    def update(self):
        """Parse and write the data unless it is already current.

        ``download_files()`` is not called here, so ``parse()`` works on the
        files that already exist under ``data/GO``.
        """
        if not self.is_current():
            #self.download_files()
            self.parse()
            self.write()
예제 #5
0
class DrugBank():
    """Download DrugBank, map its human targets to Entrez genes and export them.

    ``update()`` drives the workflow: unless ``self.version.is_current()`` is
    true, the DrugBank XML is downloaded, parsed into ``self.interactions``
    and ``self.drug_info``, and written to ``data/DrugBankInteractions.tsv``.
    """

    def __init__(self):
        """Look up the online version and set up version bookkeeping."""
        self.online_version = None
        self.get_online_version()
        self.version = Version('DrugBank', version=self.online_version)
        self.logged_version = self.version.last_logged_version()
        self.interactions = self.drug_info = None

    def is_current(self):
        """Returns True if local versions of DrugBank files are up-to-date."""
        return self.version.is_current()

    def get_online_version(self):
        """Set ``self.online_version`` from the DrugBank downloads page.

        Raises:
            ValueError: If the page's first ``<h1>`` is missing or does not
                contain ``Version <number>``.
        """
        print('Checking DrugBank Version...')
        html = urlopen('http://www.drugbank.ca/downloads')
        bsObj = BeautifulSoup(html.read(), "html.parser")
        r = re.compile(r'Version ([\d\.]+)')
        heading = bsObj.h1
        # A page without <h1> is reported like any other unparsable page.
        match = r.search(heading.text) if heading is not None else None
        if match:
            self.online_version = match.group(1)
        else:
            raise ValueError('Error loading online version.')

    @staticmethod
    def download_files():
        """Download the DrugBank XML archive, unpack it and update Entrez.

        ``drugbank.xml`` is extracted into ``data`` and the downloaded archive
        is deleted afterwards. Finally ``Entrez().update()`` is run, since
        ``parse`` reads the Entrez-derived files.
        """
        print('Downloading DrugBank XML...')
        filename = wget.download("http://www.drugbank.ca/system/downloads/current/drugbank.xml.zip")

        print('\nExtracting DrugBank XML...')
        # Close the archive before deleting it (required on Windows).
        with zipfile.ZipFile(filename) as zfile:
            zfile.extract('drugbank.xml', 'data')
        os.remove(filename)
        e = Entrez()
        e.update()

    @staticmethod
    def _load_gene_info():
        """Index ``data/gene_info.human`` by symbol, HGNC accession and Entrez id.

        Returns:
            A ``(symbol_to_info, hgnc_id_to_info, entrez_to_info)`` tuple of
            dicts. Every value is a per-gene dict holding ``'Entrez'``,
            ``'Symbol'`` and one entry per ``source:accession`` cross-reference
            found in column 5. Genes without cross-references (``'-'``) are
            only reachable through ``symbol_to_info``.
        """
        symbol_to_info = dict()
        hgnc_id_to_info = dict()
        entrez_to_info = dict()
        with open('data/gene_info.human') as f:
            for i, line in enumerate(csv.reader(f, delimiter='\t')):
                # Row 0 is the header; only taxon 9606 rows are used.
                if i == 0 or line[0] != '9606':
                    continue
                gene_symbol = line[2]
                entrez_id = line[1]
                info = {'Entrez': entrez_id, 'Symbol': gene_symbol}
                symbol_to_info[gene_symbol] = info
                if line[5] == '-':
                    continue
                for xref in line[5].split('|'):
                    (source, accession) = xref.split(':', 1)
                    info[source] = accession
                if 'HGNC' in info:
                    hgnc_id_to_info[info['HGNC']] = info
                entrez_to_info[entrez_id] = info
        return symbol_to_info, hgnc_id_to_info, entrez_to_info

    @staticmethod
    def _load_uniprot_to_entrez():
        """Map UniProt accessions in ``data/gene2accession.human`` to Entrez ids.

        Column 5 is stripped of its ``.version`` suffix and kept only when it
        starts with a UniProt-style accession.
        """
        uniprot_to_entrez = dict()
        # regex from: http://www.uniprot.org/help/accession_numbers
        r = re.compile(r'[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}')
        with open('data/gene2accession.human') as f:
            for i, line in enumerate(csv.reader(f, delimiter='\t')):
                if i == 0 or line[0] != '9606':
                    continue
                uniprot_id = line[5].split('.', 1)[0]
                if not r.match(uniprot_id):
                    continue
                uniprot_to_entrez[uniprot_id] = line[1]
        return uniprot_to_entrez

    def parse(self):
        """Parse ``data/drugbank.xml`` into ``self.interactions`` and ``self.drug_info``.

        ``self.drug_info`` maps a DrugBank id to
        ``(name, synonyms, cas_number, brands, type, groups, categories)``;
        the collection fields are sorted tuples. Drugs without targets are
        left out.

        ``self.interactions`` maps a DrugBank id to a list of
        ``(gene_id, known_action, target_actions, gene_symbol, uniprot_id,
        entrez_id, ensembl_id)`` tuples, one per target whose organism is
        ``'Human'``. The gene is resolved by HGNC accession first, then by
        UniProt accession, then by gene symbol; identifiers that cannot be
        resolved stay ``None``.
        """
        print('Parsing Entrez...')
        symbol_to_info, hgnc_id_to_info, entrez_to_info = self._load_gene_info()
        uniprot_to_entrez = self._load_uniprot_to_entrez()

        print('Parsing DrugBank XML...')
        ns = {'entry': 'http://www.drugbank.ca'}
        hgnc_label = 'HUGO Gene Nomenclature Committee (HGNC)'
        digits_only = re.compile(r'^\d+$')

        tree = ET.parse('data/drugbank.xml')
        drugs = tree.getroot().findall('entry:drug', ns)

        interactions = dict()
        drug_info = dict()
        for drug in drugs:
            drug_id = drug.find('entry:drugbank-id', ns).text
            drug_name = drug.find('entry:name', ns).text

            drug_synonyms = set()
            for synonym in drug.find('entry:synonyms', ns):
                language = synonym.get('language')
                if language == '' or language == 'English':
                    drug_synonyms.add(synonym.text)
            drug_cas_number = drug.find('entry:cas-number', ns).text
            drug_brands = set()
            for product in drug.find('entry:products', ns):
                drug_brands.add(product.find('entry:name', ns).text)
            for int_brand in drug.find('entry:international-brands', ns):
                drug_brands.add(int_brand.find('entry:name', ns).text)
            drug_type = drug.get('type')
            drug_groups = {group.text for group in drug.find('entry:groups', ns)}
            drug_categories = {
                category.find('entry:category', ns).text.lower()
                for category in drug.find('entry:categories', ns)
            }
            targets = drug.find('entry:targets', ns)
            # Explicit length test: truth-testing an Element is deprecated.
            if len(targets) == 0:
                continue
            drug_info[drug_id] = (drug_name, tuple(sorted(drug_synonyms)), drug_cas_number,
                                  tuple(sorted(drug_brands)), drug_type,
                                  tuple(sorted(drug_groups)), tuple(sorted(drug_categories)))
            for target in targets:
                organism = target.find('entry:organism', ns).text
                if organism != 'Human':
                    continue
                gene_id = target.find('entry:id', ns).text
                known_action = target.find('entry:known-action', ns).text
                target_actions = {action.text for action in target.find('entry:actions', ns)}
                gene_symbol = uniprot_id = entrez_id = ensembl_id = None
                polypeptide = target.find('entry:polypeptide', ns)
                gene_info = None
                if polypeptide is not None:
                    gene_symbol = polypeptide.find('entry:gene-name', ns).text
                    for identifier in polypeptide.find('entry:external-identifiers', ns):
                        resource = identifier.find('entry:resource', ns).text
                        if resource == hgnc_label:
                            hgnc_gene_acc = identifier.find('entry:identifier', ns).text
                            # Some identifiers are incorrectly labeled by DrugBank
                            if hgnc_gene_acc.startswith('GNC:'):
                                hgnc_gene_acc = 'H' + hgnc_gene_acc
                            elif digits_only.match(hgnc_gene_acc):
                                hgnc_gene_acc = 'HGNC:' + hgnc_gene_acc
                            hgnc_hit = hgnc_id_to_info.get(hgnc_gene_acc)
                            if hgnc_hit is not None:
                                gene_info = hgnc_hit
                                entrez_id = gene_info['Entrez']
                                # Keep the previous value when no Ensembl
                                # cross-reference exists for this gene.
                                ensembl_id = gene_info.get('Ensembl', ensembl_id)
                        elif resource == 'UniProtKB':
                            uniprot_id = identifier.find('entry:identifier', ns).text
                            if not gene_info:
                                try:
                                    entrez_id = uniprot_to_entrez[uniprot_id]
                                    gene_info = entrez_to_info[entrez_id]
                                except KeyError:
                                    # Unknown accession or gene: fall through
                                    # to the symbol lookup below.
                                    pass
                    if not gene_info:
                        gene_info = symbol_to_info.get(gene_symbol)
                        if gene_info is not None:
                            entrez_id = gene_info['Entrez']
                            # Genes without an Ensembl cross-reference are
                            # valid; report None instead of raising KeyError.
                            ensembl_id = gene_info.get('Ensembl')
                interaction_tuple = (gene_id, known_action, tuple(sorted(target_actions)),
                                     gene_symbol, uniprot_id, entrez_id, ensembl_id)
                interactions.setdefault(drug_id, []).append(interaction_tuple)
        self.interactions = interactions
        self.drug_info = drug_info

    @staticmethod
    def _format_field(datum):
        """Render one output cell: tuples comma-joined, tabs removed, empty -> 'N/A'."""
        if isinstance(datum, tuple):
            datum = ','.join(datum)
        # Some small number of rows contain tabs within text.
        datum = str(datum).replace("\t", '')
        if not datum or datum == 'None':
            return 'N/A'
        return datum

    def write(self):
        """Write all interactions to ``data/DrugBankInteractions.tsv``.

        One row is written per interaction, ordered by DrugBank id and
        prefixed with a running 1-based count. Afterwards
        ``self.version.write_log()`` is called.
        """
        print('Writing to .tsv...')
        header = ('count','drug_id','drug_name','drug_synonyms','drug_cas_number','drug_brands',
                  'drug_type','drug_groups','drug_categories','gene_id','known_action','target_actions',
                  'gene_symbol','uniprot_id','entrez_id','ensembl_id')
        # newline='' leaves line endings to the csv writer (avoids '\r\r\n'
        # row terminators on Windows).
        with open('data/DrugBankInteractions.tsv', 'w', newline='') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerow(header)
            count = 0
            for drug in sorted(self.interactions):
                for interaction in self.interactions[drug]:
                    count += 1
                    data = (count, drug) + self.drug_info[drug] + interaction
                    writer.writerow([self._format_field(datum) for datum in data])
        self.version.write_log()

    def update(self):
        """Download, parse and write the data unless ``is_current()`` is true."""
        if not self.is_current():
            self.download_files()
            self.parse()
            self.write()
예제 #6
0
파일: get_entrez.py 프로젝트: ptdtan/dgi-db
class Entrez:
    """Download NCBI Entrez gene files and export a human-only gene table.

    ``update()`` drives the workflow: unless ``self.version.is_current()`` is
    true, the files are downloaded and filtered, ``data/gene_info.human`` is
    parsed into ``self.rows`` and written to ``data/entrez_genes.tsv``.
    """

    def __init__(self):
        """Look up the online version and set up version bookkeeping."""
        self.online_version = None
        self.get_online_version()
        self.version = Version('Entrez', version=self.online_version)
        self.logged_version = self.version.last_logged_version()

    def is_current(self):
        """Returns True if local versions of Entrez files are up-to-date."""
        return self.version.is_current()

    def get_online_version(self):
        """Set ``self.online_version`` from the NCBI gene DATA directory listing.

        The date listed for ``gene2accession.gz`` is reformatted from
        ``%d-%b-%Y`` to ``%d-%B-%Y``.

        Raises:
            ValueError: If the listing has no ``gene2accession.gz`` link.
        """
        # This assumes that if gene2accession needs updating, so will other Entrez files.
        html = urlopen('http://ftp.ncbi.nlm.nih.gov/gene/DATA/')
        bsObj = BeautifulSoup(html.read(), "html.parser")
        rule = bsObj.hr
        a = rule.find('a', {"href": "gene2accession.gz"}) if rule is not None else None
        if a is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError('Error loading online version.')
        # The text after the link is expected to start with the listed date.
        listed_date = a.next.next.split()[0]
        self.online_version = datetime.datetime.strptime(listed_date, '%d-%b-%Y').strftime('%d-%B-%Y')

    @staticmethod
    def extract(file):
        """Copy the header and the human rows of a gzipped table into ``data/``.

        Args:
            file: Path of a gzip file; the output is written to
                ``data/<file without its last extension>.human``.

        The first line is always copied. Later lines are copied only when
        their first whitespace-separated token is ``9606``; blank lines are
        skipped.
        """
        with gzip.open(file, 'rb') as rf:
            with open('data/' + file.rsplit('.', 1)[0] + '.human', 'w') as wf:
                for i, line in enumerate(rf):
                    line_ascii = line.decode('utf-8')
                    if i == 0:
                        wf.write(line_ascii)
                        continue
                    fields = line_ascii.split(None, 1)
                    # A blank line has no fields; skip it rather than crash.
                    if fields and fields[0] == '9606':  # Grab human only
                        wf.write(line_ascii)

    @staticmethod
    def download_files():
        """Download and extract the gene2accession and gene_info files"""
        print('Downloading Entrez Accessions...')
        # Each label now names the file actually downloaded after it.
        print('gene2accession:')
        g2a_filename = wget.download("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz")
        print('\ngene_info:')
        gi_filename = wget.download("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz")
        print('\ninteractions:')
        ia_filename = wget.download("ftp://ftp.ncbi.nlm.nih.gov/gene/GeneRIF/interactions.gz")
        print('\nExtracting Entrez Accessions...')
        print('gene_info...')
        Entrez.extract("gene_info.gz")
        print('gene2accession...')
        Entrez.extract("gene2accession.gz")
        print('interactions...')
        Entrez.extract("interactions.gz")
        os.remove(g2a_filename)
        os.remove(gi_filename)
        os.remove(ia_filename)

    def parse(self):
        """Read ``data/gene_info.human`` into ``self.rows``.

        Every data row becomes a dict keyed by the column names listed below,
        with ``'-'`` values replaced by ``'N/A'``. An ``ensembl_ids`` entry is
        added holding the sorted ``Ensembl`` cross-references from ``dbXrefs``
        joined with ``'|'``, or ``'N/A'`` when there are none.
        """
        self.rows = []
        fieldnames = ['tax_id', 'entrez_id', 'entrez_gene_symbol', 'locus_tag',
                      'entrez_gene_synonyms', 'dbXrefs', 'chromosome', 'map_loc',
                      'description', 'type', 'sym_from_auth', 'full_from_auth',
                      'nom_status', 'other_designations', 'mod_date']
        with open('data/gene_info.human') as f:
            reader = csv.DictReader(f, delimiter='\t', fieldnames=fieldnames)
            for i, row in enumerate(reader):
                # Explicit fieldnames are supplied, so the file's own header
                # arrives as row 0 and must be skipped by hand.
                if i == 0:
                    continue
                for key, value in row.items():
                    if value == '-':
                        row[key] = 'N/A'
                ensembl = set()
                for xRef in row['dbXrefs'].split('|'):
                    if xRef == 'N/A':
                        continue
                    source, label = xRef.split(':', 1)
                    if source == 'Ensembl':
                        ensembl.add(label)
                # Sorted so the output does not depend on set ordering.
                row['ensembl_ids'] = '|'.join(sorted(ensembl)) or 'N/A'
                self.rows.append(row)

    def write(self):
        """Write selected columns of ``self.rows`` to ``data/entrez_genes.tsv``.

        Afterwards ``self.version.write_log()`` is called.
        """
        # newline='' leaves line endings to the csv writer (avoids '\r\r\n'
        # row terminators on Windows).
        with open('data/entrez_genes.tsv', 'w', newline='') as f:
            fieldnames = ['entrez_id', 'entrez_gene_symbol', 'entrez_gene_synonyms',
                          'ensembl_ids', 'description']
            writer = csv.DictWriter(f, delimiter='\t', fieldnames=fieldnames, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(self.rows)
        self.version.write_log()

    def update(self):
        """Download, parse and write the data unless ``is_current()`` is true."""
        if not self.is_current():
            self.download_files()
            self.parse()
            self.write()
예제 #7
0
class DrugBank():
    """Download DrugBank, map its human targets to Entrez genes and export them.

    ``update()`` drives the workflow: unless ``self.version.is_current()`` is
    true, the DrugBank XML is downloaded, parsed into ``self.interactions``
    and ``self.drug_info``, and written to ``data/DrugBankInteractions.tsv``.
    """

    def __init__(self):
        """Look up the online version and set up version bookkeeping."""
        self.online_version = None
        self.get_online_version()
        self.version = Version('DrugBank', version=self.online_version)
        self.logged_version = self.version.last_logged_version()
        self.interactions = self.drug_info = None

    def is_current(self):
        """Returns True if local versions of DrugBank files are up-to-date."""
        return self.version.is_current()

    def get_online_version(self):
        """Set ``self.online_version`` from the DrugBank downloads page.

        Raises:
            ValueError: If the page's first ``<h1>`` is missing or does not
                contain ``Version <number>``.
        """
        print('Checking DrugBank Version...')
        html = urlopen('http://www.drugbank.ca/downloads')
        bsObj = BeautifulSoup(html.read(), "html.parser")
        r = re.compile(r'Version ([\d\.]+)')
        heading = bsObj.h1
        # A page without <h1> is reported like any other unparsable page.
        match = r.search(heading.text) if heading is not None else None
        if match:
            self.online_version = match.group(1)
        else:
            raise ValueError('Error loading online version.')

    @staticmethod
    def download_files():
        """Download the DrugBank XML archive, unpack it and update Entrez.

        ``drugbank.xml`` is extracted into ``data`` and the downloaded archive
        is deleted afterwards. Finally ``Entrez().update()`` is run, since
        ``parse`` reads the Entrez-derived files.
        """
        print('Downloading DrugBank XML...')
        filename = wget.download(
            "http://www.drugbank.ca/system/downloads/current/drugbank.xml.zip")

        print('\nExtracting DrugBank XML...')
        # Close the archive before deleting it (required on Windows).
        with zipfile.ZipFile(filename) as zfile:
            zfile.extract('drugbank.xml', 'data')
        os.remove(filename)
        e = Entrez()
        e.update()

    @staticmethod
    def _load_gene_info():
        """Index ``data/gene_info.human`` by symbol, HGNC accession and Entrez id.

        Returns:
            A ``(symbol_to_info, hgnc_id_to_info, entrez_to_info)`` tuple of
            dicts. Every value is a per-gene dict holding ``'Entrez'``,
            ``'Symbol'`` and one entry per ``source:accession`` cross-reference
            found in column 5. Genes without cross-references (``'-'``) are
            only reachable through ``symbol_to_info``.
        """
        symbol_to_info = dict()
        hgnc_id_to_info = dict()
        entrez_to_info = dict()
        with open('data/gene_info.human') as f:
            for i, line in enumerate(csv.reader(f, delimiter='\t')):
                # Row 0 is the header; only taxon 9606 rows are used.
                if i == 0 or line[0] != '9606':
                    continue
                gene_symbol = line[2]
                entrez_id = line[1]
                info = {
                    'Entrez': entrez_id,
                    'Symbol': gene_symbol
                }
                symbol_to_info[gene_symbol] = info
                if line[5] == '-':
                    continue
                for xref in line[5].split('|'):
                    (source, accession) = xref.split(':', 1)
                    info[source] = accession
                if 'HGNC' in info:
                    hgnc_id_to_info[info['HGNC']] = info
                entrez_to_info[entrez_id] = info
        return symbol_to_info, hgnc_id_to_info, entrez_to_info

    @staticmethod
    def _load_uniprot_to_entrez():
        """Map UniProt accessions in ``data/gene2accession.human`` to Entrez ids.

        Column 5 is stripped of its ``.version`` suffix and kept only when it
        starts with a UniProt-style accession.
        """
        uniprot_to_entrez = dict()
        # regex from: http://www.uniprot.org/help/accession_numbers
        r = re.compile(
            r'[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}'
        )
        with open('data/gene2accession.human') as f:
            for i, line in enumerate(csv.reader(f, delimiter='\t')):
                if i == 0 or line[0] != '9606':
                    continue
                uniprot_id = line[5].split('.', 1)[0]
                if not r.match(uniprot_id):
                    continue
                uniprot_to_entrez[uniprot_id] = line[1]
        return uniprot_to_entrez

    def parse(self):
        """Parse ``data/drugbank.xml`` into ``self.interactions`` and ``self.drug_info``.

        ``self.drug_info`` maps a DrugBank id to
        ``(name, synonyms, cas_number, brands, type, groups, categories)``;
        the collection fields are sorted tuples. Drugs without targets are
        left out.

        ``self.interactions`` maps a DrugBank id to a list of
        ``(gene_id, known_action, target_actions, gene_symbol, uniprot_id,
        entrez_id, ensembl_id, references)`` tuples, one per target whose
        organism is ``'Human'``. ``references`` is the sorted tuple of PubMed
        ids found in the target's references text. The gene is resolved by
        HGNC accession first, then by UniProt accession, then by gene symbol;
        identifiers that cannot be resolved stay ``None``.
        """
        print('Parsing Entrez...')
        symbol_to_info, hgnc_id_to_info, entrez_to_info = self._load_gene_info()
        uniprot_to_entrez = self._load_uniprot_to_entrez()

        print('Parsing DrugBank XML...')
        ns = {'entry': 'http://www.drugbank.ca'}
        hgnc_label = 'HUGO Gene Nomenclature Committee (HGNC)'
        digits_only = re.compile(r'^\d+$')
        refs_regex = re.compile(
            r'"Pubmed":http://www.ncbi.nlm.nih.gov/pubmed/(\d+)')

        tree = ET.parse('data/drugbank.xml')
        drugs = tree.getroot().findall('entry:drug', ns)

        interactions = dict()
        drug_info = dict()
        for drug in drugs:
            drug_id = drug.find('entry:drugbank-id', ns).text
            drug_name = drug.find('entry:name', ns).text

            drug_synonyms = set()
            for synonym in drug.find('entry:synonyms', ns):
                language = synonym.get('language')
                if language == '' or language == 'English':
                    drug_synonyms.add(synonym.text)
            drug_cas_number = drug.find('entry:cas-number', ns).text
            drug_brands = set()
            for product in drug.find('entry:products', ns):
                drug_brands.add(product.find('entry:name', ns).text)
            for int_brand in drug.find('entry:international-brands', ns):
                drug_brands.add(int_brand.find('entry:name', ns).text)
            drug_type = drug.get('type')
            drug_groups = {group.text for group in drug.find('entry:groups', ns)}
            drug_categories = {
                category.find('entry:category', ns).text.lower()
                for category in drug.find('entry:categories', ns)
            }
            targets = drug.find('entry:targets', ns)
            # Explicit length test: truth-testing an Element is deprecated.
            if len(targets) == 0:
                continue
            drug_info[drug_id] = (drug_name, tuple(sorted(drug_synonyms)),
                                  drug_cas_number, tuple(sorted(drug_brands)),
                                  drug_type, tuple(sorted(drug_groups)),
                                  tuple(sorted(drug_categories)))
            for target in targets:
                organism = target.find('entry:organism', ns).text
                if organism != 'Human':
                    continue
                gene_id = target.find('entry:id', ns).text
                known_action = target.find('entry:known-action', ns).text
                target_actions = {
                    action.text for action in target.find('entry:actions', ns)
                }
                gene_symbol = uniprot_id = entrez_id = ensembl_id = None
                # An empty <references/> element has text None.
                raw_refs = target.find('entry:references', ns).text
                pubmed_ids = set()
                for chunk in (raw_refs or '').split('#'):
                    match = refs_regex.search(chunk)
                    if match:
                        pubmed_ids.add(match.group(1))
                # Sorted so the output does not depend on set ordering.
                references = tuple(sorted(pubmed_ids))
                polypeptide = target.find('entry:polypeptide', ns)
                gene_info = None
                if polypeptide is not None:
                    gene_symbol = polypeptide.find('entry:gene-name', ns).text
                    for identifier in polypeptide.find(
                            'entry:external-identifiers', ns):
                        resource = identifier.find('entry:resource', ns).text
                        if resource == hgnc_label:
                            hgnc_gene_acc = identifier.find(
                                'entry:identifier', ns).text
                            # Some identifiers are incorrectly labeled by DrugBank
                            if hgnc_gene_acc.startswith('GNC:'):
                                hgnc_gene_acc = 'H' + hgnc_gene_acc
                            elif digits_only.match(hgnc_gene_acc):
                                hgnc_gene_acc = 'HGNC:' + hgnc_gene_acc
                            hgnc_hit = hgnc_id_to_info.get(hgnc_gene_acc)
                            if hgnc_hit is not None:
                                gene_info = hgnc_hit
                                entrez_id = gene_info['Entrez']
                                # Keep the previous value when no Ensembl
                                # cross-reference exists for this gene.
                                ensembl_id = gene_info.get('Ensembl', ensembl_id)
                        elif resource == 'UniProtKB':
                            uniprot_id = identifier.find(
                                'entry:identifier', ns).text
                            if not gene_info:
                                try:
                                    entrez_id = uniprot_to_entrez[uniprot_id]
                                    gene_info = entrez_to_info[entrez_id]
                                except KeyError:
                                    # Unknown accession or gene: fall through
                                    # to the symbol lookup below.
                                    pass
                    if not gene_info:
                        gene_info = symbol_to_info.get(gene_symbol)
                        if gene_info is not None:
                            entrez_id = gene_info['Entrez']
                            # Genes without an Ensembl cross-reference are
                            # valid; report None instead of raising KeyError.
                            ensembl_id = gene_info.get('Ensembl')
                interaction_tuple = (gene_id, known_action,
                                     tuple(sorted(target_actions)),
                                     gene_symbol, uniprot_id, entrez_id,
                                     ensembl_id, references)
                interactions.setdefault(drug_id, []).append(interaction_tuple)
        self.interactions = interactions
        self.drug_info = drug_info

    @staticmethod
    def _format_field(datum):
        """Render one output cell: tuples comma-joined, tabs removed, empty -> 'N/A'."""
        if isinstance(datum, tuple):
            datum = ','.join(datum)
        # Some small number of rows contain tabs within text.
        datum = str(datum).replace("\t", '')
        if not datum or datum == 'None':
            return 'N/A'
        return datum

    def write(self):
        """Write all interactions to ``data/DrugBankInteractions.tsv``.

        One row is written per interaction, ordered by DrugBank id and
        prefixed with a running 1-based count. Afterwards
        ``self.version.write_log()`` is called.
        """
        print('Writing to .tsv...')
        header = ('count', 'drug_id', 'drug_name', 'drug_synonyms',
                  'drug_cas_number', 'drug_brands', 'drug_type', 'drug_groups',
                  'drug_categories', 'gene_id', 'known_action',
                  'target_actions', 'gene_symbol', 'uniprot_id', 'entrez_id',
                  'ensembl_id', 'pmid')
        # newline='' leaves line endings to the csv writer (avoids '\r\r\n'
        # row terminators on Windows).
        with open('data/DrugBankInteractions.tsv', 'w', newline='') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerow(header)
            count = 0
            for drug in sorted(self.interactions):
                for interaction in self.interactions[drug]:
                    count += 1
                    data = (count, drug) + self.drug_info[drug] + interaction
                    writer.writerow(
                        [self._format_field(datum) for datum in data])
        self.version.write_log()

    def update(self):
        """Download, parse and write the data unless ``is_current()`` is true."""
        if not self.is_current():
            self.download_files()
            self.parse()
            self.write()