Пример #1
0
    def __init__(self,
                 wget=os.path.join("/", "usr", "bin", "wget"),
                 gunzip=os.path.join("/", "usr", "bin", "gunzip")):
        """
        Check the environment and locate the external download tools.

        wget   - preferred location of the wget executable
        gunzip - preferred location of the gunzip executable

        Raises an Exception on win32, when the 'data' or 'dbname' config
        entries are empty, when the data directory does not exist, or when
        wget / gunzip cannot be found at any candidate path.

        On success the working directory of the process is changed to the
        data directory (the previous one is kept in self.cwd) and the
        located tools are stored in self.wgetPath and self.gunzipPath.
        """

        settings = Configure()

        ## win32 is the only platform that is rejected outright
        if _platform == "win32":
            raise Exception("DatabaseFetch currently does not work on windows platforms\n"
                            "you may still download the files one by one and populated the db")

        ## the config file has to be filled in first
        if any(settings.log[field] == '' for field in ('data', 'dbname')):
            raise Exception(
                "You must modify the config file before running DatabaseFetch.py")

        dataDir = settings.log['data']
        if not os.path.isdir(dataDir):
            raise Exception(
                "Specified htsint data directory does not exist %s" % dataDir)

        ## remember where we started, then work from the data directory
        self.cwd = os.getcwd()
        os.chdir(dataDir)

        ## first candidate that exists wins; the caller's path is tried first
        wgetCandidates = (wget, os.path.join('/', 'usr', 'local', 'bin', 'wget'))
        foundWget = next(
            (path for path in wgetCandidates if os.path.exists(path)), None)
        if foundWget is None:
            raise Exception(
                "ERROR: cannot find wget -- either download the files (see documentation) or specify a path")
        self.wgetPath = foundWget

        gunzipCandidates = (gunzip, os.path.join("/", "bin", "gunzip"))
        foundGunzip = next(
            (path for path in gunzipCandidates if os.path.exists(path)), None)
        if foundGunzip is None:
            raise Exception(
                "ERROR: cannot find gunzip -- either download the files (see documentation) or specify a path")
        self.gunzipPath = foundGunzip
Пример #2
0
def get_file_sizes():
    """
    Count the distinct first-column ids in the idmapping file.

    Every row of the tab-delimited idmapping file is read.  A row is skipped
    when its second column is 'ncbi-taxa-id' and its third column is not in
    the configured taxa list; the first column of every other row is
    collected into a set.  A debug line with the counts is printed.

    Returns a tuple (number of distinct ids, gene info line count), where
    the second value is read_gene_info_file(lineCount=True).
    """

    config = Configure()
    taxaList = config.log['taxa']
    geneInfoCount = read_gene_info_file(lineCount=True)
    idmappingFile = get_idmapping_file()
    records = set()
    totalRecords = 0

    ## plain 'r' already gives universal newlines on Python 3, and the
    ## with-block closes the file even when a row fails to parse
    with open(idmappingFile, 'r') as idmappingFid:
        for record in csv.reader(idmappingFid, delimiter="\t"):
            totalRecords += 1
            ## NOTE(review): other code in this package keys taxon rows as
            ## 'NCBI_TaxID' -- confirm 'ncbi-taxa-id' is the label in this file
            if record[1] == 'ncbi-taxa-id' and record[2] not in taxaList:
                continue
            records.add(record[0])

    uniqueCount = len(records)
    ## a single parenthesised argument prints the same on Python 2 and 3
    print('debug. totalRecords %s, records %s, geneInfoCount %s' % (
        totalRecords, uniqueCount, geneInfoCount))
    return uniqueCount, geneInfoCount
Пример #3
0
    def setUp(self):
        """
        Prepare the paths used by the tests.

        self.queryFile - adh.fasta located next to this test module
        self.BLASTDB   - the 'data' entry of the configuration
        """

        moduleDir = os.path.dirname(__file__)
        self.queryFile = os.path.join(moduleDir, "adh.fasta")
        self.BLASTDB = Configure().log['data']
Пример #4
0
def get_gene2go_file():
    """
    locate gene2go.db inside the configured data directory
    raise an Exception when the file is missing
    return the full path to the file
    """

    gene2goPath = os.path.join(Configure().log['data'], 'gene2go.db')
    if not os.path.exists(gene2goPath):
        raise Exception("Could not find 'gene2go' -- did you run FetchDbData.py?")

    return gene2goPath
Пример #5
0
def get_ontology_file():
    """
    locate go.obo inside the configured data directory
    raise an Exception when the file is missing
    return the full path to the file
    """

    settings = Configure()
    oboPath = os.path.join(settings.log['data'], 'go.obo')

    if os.path.exists(oboPath):
        return oboPath

    raise Exception("Could not find 'go.obo' -- did you run DatabaseFetch?")
Пример #6
0
def get_idmapping_file():
    """
    locate idmapping.dat.db inside the configured data directory
    raise an Exception when the file is missing
    return the full path to the file
    """

    mappingPath = os.path.join(Configure().log['data'], 'idmapping.dat.db')
    if not os.path.exists(mappingPath):
        raise Exception("Could not find 'idmapping.dat.db' -- did you run FetchDbData.py?")

    return mappingPath
Пример #7
0
def get_annotation_file():
    """
    locate gene_association.goa_uniprot.db inside the configured data directory
    raise an Exception naming the file when it is missing
    return the full path to the file
    """

    fileName = 'gene_association.goa_uniprot.db'
    settings = Configure()
    fullPath = os.path.join(settings.log['data'], fileName)

    if os.path.exists(fullPath):
        return fullPath

    raise Exception("Could not find '%s' -- did you run FetchDbData.py?"%(fileName))
Пример #8
0
def read_gene_info_file(lineCount=False, short=False):
    """
    Read the essential columns from NCBI's gene info file (gene_info.db).

    The first line of the file, blank lines and rows starting with '#' are
    skipped.  Only rows whose first column (the taxon id) is in the
    configured taxa list are used.

    lineCount - when True, return only the number of matching rows
    short     - when True, map each gene id to its taxon id only

    Returns an int when lineCount is True.  Otherwise returns a dict keyed
    by the second column (the NCBI gene id) whose values are the taxon id
    (short=True) or the list
    [taxId, symbol, synonyms, chromosome, map_location, description].
    """

    config = Configure()
    taxaList = config.log['taxa']
    geneInfoFile = os.path.join(config.log['data'], "gene_info.db")
    geneInfo = {}
    totalLines = 0

    ## the with-block closes the file even if a malformed row raises
    with open(geneInfoFile, 'r') as geneInfoFid:
        ## drop the header; the default keeps an empty file from raising
        next(geneInfoFid, None)

        for line in geneInfoFid:
            if not line.strip():
                continue

            record = line.rstrip("\n").split("\t")
            if record[0].startswith("#"):
                continue

            taxId = record[0]
            ncbiId = record[1]

            if taxId not in taxaList:
                continue

            if lineCount:
                totalLines += 1
                continue

            if short:
                geneInfo[ncbiId] = taxId
            else:
                ## columns: 2 symbol, 4 synonyms, 6 chromosome,
                ## 7 map location, 8 description
                geneInfo[ncbiId] = [
                    taxId, record[2], record[4], record[6], record[7],
                    record[8]
                ]

    if lineCount:
        return totalLines
    return geneInfo
Пример #9
0
def print_db_summary():
    """
    Print a summary of the row counts for the database tables.

    Connects through db_connect(verbose=False), prints a header naming the
    configured database and then one line per table (Taxon, Gene, Uniprot,
    GoTerm, GoAnnotation) with its row count.

    Returns the same text as one string, each line terminated by a newline.
    """

    print('querying database...')
    config = Configure()
    session,engine = db_connect(verbose=False)

    header = "\nDATABASE - %s - SUMMARY"%config.log['dbname']
    print(header)
    summaryLines = [header]

    for table in [Taxon,Gene,Uniprot,GoTerm,GoAnnotation]:
        ## count once per table so the printed and the returned text cannot
        ## disagree and the query is not issued twice
        line = "There are %s entries in the %s table"%(session.query(table).count(),table.__tablename__)
        print(line)
        summaryLines.append(line)

    return "\n".join(summaryLines) + "\n"
Пример #10
0
def ask_upass():
    """
    return the password for the database
    the 'dbpass' config entry is used when it is set, otherwise the user
    is prompted through getpass
    raise an Exception when 'data' or 'dbname' is empty in the config
    """

    settings = Configure()

    if any(settings.log[field] == '' for field in ('data','dbname')):
        raise Exception("You must modify the config file before running DatabaseFetch.py")

    stored = settings.log['dbpass']
    if stored != '':
        return stored

    return getpass.getpass()
Пример #11
0
    def __init__(self):
        """
        Load the configuration and keep the taxa list.

        Raises an Exception when the 'data' or 'dbname' config entries are
        empty or when the configured data directory does not exist.
        """

        self.config = Configure()

        ## the config file has to be filled in first
        if any(self.config.log[field] == '' for field in ('data','dbname')):
            raise Exception("You must modify the config file before running DatabaseFetch.py")

        dataDir = self.config.log['data']
        if os.path.isdir(dataDir):
            self.taxaList = self.config.log['taxa']
        else:
            raise Exception("Specified htsint data directory does not exist %s"%dataDir)
Пример #12
0
def db_connect(verbose=False, upass=''):
    """
    Connect to the PostgreSQL database described in the config file.

    verbose - when True, print progress messages and enable engine echo
    upass   - database password; when empty, ask_upass() supplies it

    Returns a tuple (session, engine).
    Raises an Exception when 'data' or 'dbname' is empty in the config, or
    when the user, host, name or port entry is empty.
    """

    config = Configure()

    for key in ['data', 'dbname']:
        if config.log[key] == '':
            raise Exception(
                "You must modify the config file before running DatabaseFetch.py"
            )

    check_version()

    ## declare variables
    uname = config.log['dbuser']
    dbhost = config.log['dbhost']
    dbname = config.log['dbname']
    port = config.log['dbport']

    ## validate before asking for a password so that an incomplete config
    ## fails without prompting the user first
    if '' in (dbname, port, dbhost, uname):
        raise Exception(
            "Invalid database parameters -- parameters not specified in config file"
        )

    ## get data base parameters
    if upass == '':
        upass = ask_upass()

    ## create connection to db and create necessary tables
    if verbose:
        print("connecting to database: %s" % dbname)

    ## NOTE(review): the credentials are interpolated unescaped; a password
    ## containing '@', ':' or '/' would produce a malformed URL -- confirm
    ## whether such passwords need to be supported
    engine = create_engine('postgresql://%s:%s@%s:%s/%s' %
                           (uname, upass, dbhost, port, dbname),
                           echo=verbose)

    ## open one connection so bad credentials fail here, then release it
    engine.connect().close()

    Session = sessionmaker(bind=engine)
    session = Session()
    if verbose:
        print('connected.')

    return session, engine
Пример #13
0
def get_total_annotations():
    """
    Count the annotations that are available in the two annotation files.

    From the uniprot gene association file a row is counted when its first
    column is 'UniProtKB' and its taxon (index 12, with the 'taxon:' prefix
    removed) is a single, non-empty id found in the configured taxa list.
    From the gene2go file every line after the header is counted.

    Returns the sum of both counts.
    """

    config = Configure()
    taxaList = config.log['taxa']
    totalAnnotations = 0

    with open(get_annotation_file(),'r') as annotationFid:
        for line in annotationFid:
            ## rstrip only removes a newline that is really there, so a
            ## final line without one keeps its last character
            record = line.rstrip("\n").split("\t")

            ## this one test also rejects '!' comment lines and blank lines
            if record[0] != 'UniProtKB':
                continue

            taxon = record[12].replace("taxon:","")
            ## '|' marks an annotation that spans several taxa
            if taxon == "" or "|" in taxon:
                continue
            if taxon not in taxaList:
                continue

            totalAnnotations += 1

    with open(get_gene2go_file(),'r') as gene2goFid:
        ## drop the header; the default keeps an empty file from raising
        next(gene2goFid, None)
        totalAnnotations += sum(1 for _ in gene2goFid)

    return totalAnnotations
Пример #14
0
def populate_uniprot_table(lineCount,session,engine):
    """
    Populate the uniprot table with entries from the idmapping file.

    lineCount - expected number of rows, used only for the progress output
    session   - database session handed to gene_mapper and taxa_mapper
    engine    - database engine used for the bulk inserts

    Rows of the idmapping file are grouped by UniProtKB entry name.  The
    accumulated entries are written in blocks; only entries whose taxon id
    is in the configured taxa list are inserted.

    Returns a tuple (timeStr, addedStr) of summary strings.
    """

    config = Configure()
    taxaList = config.log['taxa']
    timeStart = time.time()
    totalLines,totalRecords = 0,0
    idmappingFile = get_idmapping_file()
    ac2kbMap,toAdd = {},{}
    wayPoints = set(int(w) for w in np.linspace(0,lineCount,20))

    print("getting mappers...")
    geneIdMap = gene_mapper(session)
    taxonIdMap = taxa_mapper(session)
    print("mappers loaded... %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))

    def join_or_none(values):
        ## None for an empty collection, otherwise the members joined by ';'
        values = list(values)
        return ";".join(values) if values else None

    def queue_entries(toAdd,geneIdMap,taxonIdMap,engine):
        ## turn the accumulated entries into rows and insert them in one go
        toCommit = []

        ## items() behaves the same on Python 2 and 3 (iteritems is 2 only)
        for uniprotKbEntry, entry in toAdd.items():

            ## ensure we are in appropriate taxa
            if entry['ncbi-taxa-id'] not in taxaList:
                continue

            ## convert the gene id to a database key (check old names if we cannot find it)
            db_gene_id = None
            geneId = entry['gene-id']
            if geneId is not None:
                if geneId in geneIdMap:
                    db_gene_id = geneIdMap[geneId]
                else:
                    ## the field may hold several ';' separated ids; as
                    ## before, the last one that is known wins
                    for _gid in geneId.split(";"):
                        _gid = re.sub(r"\s+","",_gid)
                        if _gid in geneIdMap:
                            ## store the database key, not the raw NCBI id
                            db_gene_id = geneIdMap[_gid]

            ## convert the taxa id to a database key
            db_taxa_id = None
            if entry['ncbi-taxa-id'] and entry['ncbi-taxa-id'] in taxonIdMap:
                db_taxa_id = taxonIdMap[entry['ncbi-taxa-id']]

            toCommit.append({'uniprot_ac':join_or_none(entry['uniprot-ac']),
                             'uniprot_entry':uniprotKbEntry,
                             'refseq':join_or_none(entry['refseq']),
                             'taxa_id':db_taxa_id,'gene_id':db_gene_id})

        if toCommit:
            with engine.begin() as connection:
                connection.execute(Uniprot.__table__.insert().
                                   values(toCommit))

    ## parse the idmapping file into the db
    ## NOTE(review): 'rb' suits the Python 2 csv module; the Python 3 csv
    ## reader needs a text-mode file -- confirm the target interpreter
    with open(idmappingFile,'rb') as idmappingFid:
        reader = csv.reader(idmappingFid,delimiter="\t")

        for record in reader:

            if len(record) != 3:
                continue

            uniprotKbAc,idType,idValue = record
            totalLines += 1
            if totalLines in wayPoints:
                print("\t%s / %s"%(totalLines,lineCount))

            if idType not in ('NCBI_TaxID','GeneID','UniProtKB-ID','RefSeq'):
                continue

            if idType == 'UniProtKB-ID' and uniprotKbAc not in ac2kbMap:
                ac2kbMap[uniprotKbAc] = idValue

            ## skip the XXXX-1 like uniprot ac
            if uniprotKbAc not in ac2kbMap:
                continue

            ## get current key
            uniprotKbEntry = ac2kbMap[uniprotKbAc]

            ## make new entry if necessary
            if uniprotKbEntry and uniprotKbEntry not in toAdd:

                ## queue entries in blocks
                totalRecords += 1

                if totalRecords % 100000 == 0:
                    queue_entries(toAdd,geneIdMap,taxonIdMap,engine)
                    toAdd,ac2kbMap = {},{}

                ac2kbMap[uniprotKbAc] = uniprotKbEntry
                toAdd[uniprotKbEntry] = {'ncbi-taxa-id':None,
                                         'gene-id':None,
                                         'uniprot-ac':set(),
                                         'refseq':set()}

            ## populate uniprot dictionary
            current = toAdd[uniprotKbEntry]
            current['uniprot-ac'].add(uniprotKbAc)

            ## empty values are not recorded
            if idValue:
                if idType == 'NCBI_TaxID':
                    current['ncbi-taxa-id'] = idValue
                elif idType == 'GeneID':
                    current['gene-id'] = idValue
                elif idType == 'RefSeq':
                    current['refseq'].add(idValue)

    ## queue any remaining
    if toAdd:
        queue_entries(toAdd,geneIdMap,taxonIdMap,engine)

    timeStr = "...total time taken: %s"%time.strftime('%H:%M:%S', time.gmtime(time.time()-timeStart))
    addedStr = "...%s unique uniprot entries were added."%totalRecords
    return timeStr,addedStr
Пример #15
0
def populate_gene_table(geneInfoCount,session,engine):
    """
    Populate the gene table from the gene_info.db file.

    geneInfoCount - expected number of rows, used only for progress output
    session       - database session handed to taxa_mapper
    engine        - database engine used for the bulk inserts

    Only rows whose taxon id is in the configured taxa list are read.  Rows
    are inserted in batches; rows whose taxon id has no entry in the taxa
    mapper are dropped at insert time but are still included in the
    reported total.

    Returns a tuple (timeStr, addedStr) of summary strings.
    """

    config = Configure()
    taxaList = config.log['taxa']
    timeStart = time.time()
    toAdd = []
    totalRecords = 0
    total = geneInfoCount
    wayPoints = set(int(w) for w in np.linspace(0,total,20))
    geneInfoFile = os.path.join(config.log['data'],"gene_info.db")
    taxaIdMap = taxa_mapper(session)

    def insert_batch(rows):
        ## swap each NCBI taxon id for its database key, drop the rows whose
        ## taxon is unknown, then bulk insert what is left; building a new
        ## list avoids the quadratic cost of repeated list.remove calls
        kept = []
        for row in rows:
            if row['taxa_id'] in taxaIdMap:
                row['taxa_id'] = taxaIdMap[row['taxa_id']]
                kept.append(row)

        if kept:
            with engine.begin() as connection:
                connection.execute(Gene.__table__.insert().
                                   values(kept))

    ## plain 'r' (the 'U' flag is rejected by Python 3.11+); the with-block
    ## closes the file even when a row or an insert fails
    with open(geneInfoFile,'r') as geneInfoFid:
        ## drop the header; the default keeps an empty file from raising
        next(geneInfoFid, None)

        for line in geneInfoFid:
            record = line.rstrip("\n").split("\t")

            if record[0].startswith("#"):
                continue

            taxId = record[0]
            if taxId not in taxaList:
                continue

            ## define the table entry
            toAdd.append({'ncbi_id':record[1],'description':record[8],
                          'symbol':record[2],'synonyms':record[4],
                          'chromosome':record[6],'map_location':record[7],
                          'taxa_id':taxId})
            totalRecords += 1

            if len(toAdd) >= 200000:
                insert_batch(toAdd)
                toAdd = []

            ## show progress
            if totalRecords in wayPoints:
                print("\t%s / %s"%(totalRecords,total))

    print('committing changes...')
    insert_batch(toAdd)

    timeStr = "...total time taken: %s"%time.strftime('%H:%M:%S', time.gmtime(time.time()-timeStart))
    addedStr = "...%s unique genes were added."%totalRecords
    return timeStr,addedStr
Пример #16
0
def populate_taxon_table(engine):
    """
    Populate the taxon table from the names.dmp file (all taxa).

    engine - database engine used for the bulk inserts

    Each line is split on '|' and stripped of tabs.  Lines whose fourth
    field is 'scientific name' set the name of the taxon given in the first
    field; lines whose fourth field contains 'common name' fill up to three
    common name columns.  All other lines are ignored.

    Returns a tuple (timeStr, addedStr) of summary strings.
    Raises an Exception when names.dmp is not in the data directory.
    """

    config = Configure()
    namesFile = os.path.join(config.log['data'],"names.dmp")
    if not os.path.exists(namesFile):
        raise Exception("Cannot find names.dmp... exiting")

    taxaCount = 0
    timeStart = time.time()
    toAdd = {}
    commonSlots = ('common_name_1','common_name_2','common_name_3')

    def insert_batch(rows):
        ## nothing to insert for an empty batch; the dict view is turned
        ## into a real list because that is what a multi-row insert takes
        if rows:
            with engine.begin() as connection:
                connection.execute(Taxon.__table__.insert().
                                   values(list(rows.values())))

    ## plain 'r' (the 'U' flag is rejected by Python 3.11+); the with-block
    ## closes the file even when an insert fails
    with open(namesFile,'r') as namesFID:
        for line in namesFID:
            fields = [element.replace("\t","") for element in line.rstrip("\n").split("|")]

            ## blank or truncated lines carry no name class
            if len(fields) < 4:
                continue

            taxaID,name,nameClass = fields[0],fields[1],fields[3]
            isScientific = nameClass == 'scientific name'
            if not isScientific and "common name" not in nameClass:
                continue

            if taxaID == 'root':
                continue

            ## if record does not exist
            if taxaID not in toAdd:
                taxaCount += 1
                ## flush only when a new taxon starts
                if len(toAdd) >= 300000:
                    insert_batch(toAdd)
                    toAdd = {}

                toAdd[taxaID] = {'ncbi_id':taxaID,'name':None,'common_name_1':None,
                                 'common_name_2':None,'common_name_3':None}

            row = toAdd[taxaID]
            if isScientific:
                row['name'] = name
            else:
                ## use the first free common name column; extras are dropped
                for slot in commonSlots:
                    if row[slot] is None:
                        row[slot] = name
                        break

    print('committing changes...')
    insert_batch(toAdd)

    timeStr = "...total time taken: %s"%time.strftime('%H:%M:%S', time.gmtime(time.time()-timeStart))
    addedStr =  "...%s unique taxa were added."%taxaCount
    return timeStr, addedStr
Пример #17
0
    optlist, args = getopt.getopt(sys.argv[1:], 'v')
except getopt.GetoptError:
    print(getopt.GetoptError)
    print(sys.argv[0] + "-v")
    print("... the verbose flag (-v) may be used")
    sys.exit()

## run flags; VERBOSE is switched on by the -v option in the loop below,
## RUNALL is only initialised in this section
VERBOSE = False
RUNALL = False

## optlist holds the (option, value) pairs returned by getopt.getopt
for o, a in optlist:
    if o == '-v':
        VERBOSE = True

## ensure config is setup: an empty 'dbname' entry means the config file
## has not been filled in, so stop before any test suite is built
config = Configure()

if config.log['dbname'] == '':
    raise Exception("Config file is not setup")

## Database tests
## (DatabaseTest is presumably supplied by the star import -- confirm)
from .DatabaseTest import *
DatabaseTestSuite = unittest.TestLoader().loadTestsFromTestCase(DatabaseTest)
DatabaseSuite = unittest.TestSuite([DatabaseTestSuite])

## GeneOntology tests
## (GeneOntologyTest is presumably supplied by the star import -- confirm)
from .GeneOntologyTest import *
GeneOntologyTestSuite = unittest.TestLoader().loadTestsFromTestCase(
    GeneOntologyTest)
GeneOntologySuite = unittest.TestSuite([GeneOntologyTestSuite])
Пример #18
0
def populate_go_annotations(totalAnnotations,session,engine):
    """
    Populate the go annotation table from the uniprot gene association file
    and from the gene2go file.  This will take some time.

    The gene association file is intended to follow
    http://www.geneontology.org/GO.format.gaf-2_0.shtml

    totalAnnotations - expected number of annotations, used only for the
                       progress output
    session          - database session used by the mappers and for
                       looking up alternate GO term ids
    engine           - database engine used for the bulk inserts

    Returns a tuple (timeStr, addedStr, (ignoredUniprot, ignoredGene)),
    where the last element counts the annotations that were dropped because
    their taxon is not known to the taxa mapper.
    """

    timeStart = time.time()
    config = Configure()
    taxaList = config.log['taxa']
    ## resolve the path first so a missing file fails before the mappers load
    annotationFile = get_annotation_file()
    wayPoints = set(int(w) for w in np.linspace(0,totalAnnotations,20))
    annotationCount = 0

    print("...loading mappers")
    termIdMap = goterm_mapper(session)
    taxaIdMap = taxa_mapper(session)
    uniprotIdMap = uniprot_mapper(session)
    print("...populating rows")

    def queue_entry(goId,evidenceCode,pubmedRefs,uniprotId,geneId,taxon,toAdd,mapper):
        ## Append one row to toAdd when every id can be resolved.  Returns
        ## True only when the row was dropped for an unknown taxon, so the
        ## caller can count it: an int passed in as an argument cannot be
        ## incremented for the caller.

        ## remove invalid term ids
        if goId in termIdMap:
            go_db_id = termIdMap[goId]
        else:
            queryTerm = session.query(GoTerm).filter_by(alternate_id=goId).first()
            if queryTerm is None:
                return False
            go_db_id = queryTerm.id

        ## remove invalid uniprot ids
        uniprot_db_id = None
        if uniprotId:
            if uniprotId not in mapper:
                return False
            uniprot_db_id = mapper[uniprotId]

        ## remove invalid gene ids
        gene_db_id = None
        if geneId:
            if geneId not in mapper:
                return False
            gene_db_id = mapper[geneId]

        ## ignore annotations that have an outdated taxon
        if taxon not in taxaIdMap:
            return True

        toAdd.append({'go_term_id':go_db_id,'evidence_code':evidenceCode,
                      'pubmed_refs':pubmedRefs,'uniprot_id':uniprot_db_id,
                      'gene_id':gene_db_id,'taxa_id':taxaIdMap[taxon]})
        return False

    def insert_rows(rows):
        ## bulk insert one batch; an empty batch is skipped
        if rows:
            with engine.begin() as connection:
                connection.execute(GoAnnotation.__table__.insert().
                                   values(rows))

    ## add annotations from uniprot annotation file
    toAdd = []
    ignoredAnnotationsUniprot = 0
    print("...getting annotations from gene_association (uniprot)")

    ## plain 'r' (the 'U' flag is rejected by Python 3.11+)
    with open(annotationFile,'r') as annotationFid:
        for line in annotationFid:
            record = line.rstrip("\n").split("\t")

            ## this one test also rejects '!' comment lines and blank lines
            if record[0] != 'UniProtKB':
                continue

            goId = record[4]
            pubmedRefs = record[5]
            evidenceCode = record[6]
            uniprotEntry = record[10]
            taxon = record[12].replace("taxon:","")

            if taxon not in taxaList:
                continue

            ## parse the uniprot Entry (keep the part before the first '|')
            uniprotEntry = uniprotEntry.split("|")[0]

            ## ignore annotations with multiple species
            if "|" in taxon:
                continue

            ## update progress
            annotationCount += 1
            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            if queue_entry(goId,evidenceCode,pubmedRefs,uniprotEntry,None,taxon,toAdd,
                           uniprotIdMap):
                ignoredAnnotationsUniprot += 1

            if len(toAdd) >= 100000:
                insert_rows(toAdd)
                toAdd = []

    print('committing final changes...')
    print('ignored annotations after uniprot... %s'%(ignoredAnnotationsUniprot))
    insert_rows(toAdd)

    ## the uniprot mapper is no longer needed
    del uniprotIdMap

    ## add annotations from gene2go
    gene2goFile = get_gene2go_file()
    ignoredAnnotationsGene = 0
    print("...getting annotations from gene2go")
    geneIdMap = gene_mapper(session)
    toAdd = []

    with open(gene2goFile,'r') as gene2goFid:
        ## drop the header; the default keeps an empty file from raising
        next(gene2goFid, None)

        for line in gene2goFid:
            record = line.rstrip("\n").split("\t")

            if record[0].startswith("#") or len(record) != 8:
                continue

            taxon = record[0]
            ncbiId = record[1]
            goId = record[2]
            evidenceCode = record[3]
            pubmedRefs = record[6]
            ## counted before the taxa filter, unlike the uniprot loop
            annotationCount += 1

            if taxon not in taxaList:
                continue

            if annotationCount in wayPoints:
                print("\t%s / %s"%(annotationCount,totalAnnotations))

            if queue_entry(goId,evidenceCode,pubmedRefs,None,ncbiId,taxon,toAdd,
                           geneIdMap):
                ignoredAnnotationsGene += 1

            if len(toAdd) >= 100000:
                insert_rows(toAdd)
                toAdd = []

    print('ignored annotations after gene2go... %s'%(ignoredAnnotationsGene))
    print('committing final changes...')
    insert_rows(toAdd)

    timeStr = "...total time taken: %s"%time.strftime('%H:%M:%S', time.gmtime(time.time()-timeStart))
    addedStr = "...%s unique go annotation entries were added."%annotationCount
    return timeStr,addedStr,(ignoredAnnotationsUniprot,ignoredAnnotationsGene)