def parse_genofile(config, config_dic): genofile = open(config_dic['genofile'], 'r') meta_dic = {} for line in genofile: line = line.strip() if len(line) == 0: continue if line.startswith('#'): continue if line.startswith('@'): line = line.strip('@') items = line.split(';') for item in items: kv = re.split(':|=', item) meta_dic[kv[0].strip()] = kv[1].strip() continue if line.lower().startswith("chr"): # print("geno file meta dictionary:") for k, v in meta_dic.items(): print("\t%s: %s" % (k, v)) # print("geno file head:\n\t%s" % line) strainnames = line.split()[4:] config_dic['strains'] = datastructure.get_strains_bynames(inbredsetid=config_dic['inbredsetid'], strainnames=strainnames, updatestrainxref="yes") continue # geno file line, marker marker_dic = parse_marker(line) marker_dic['genoid'] = check_or_insert_geno(config_dic, marker_dic) if check_genoxref(config_dic, marker_dic): continue insert_genodata(config, config_dic, marker_dic) insert_genoxref(config_dic, marker_dic) config_dic['dataid'] += 1 genofile.close()
def main(argv): # config config = utilities.get_config(argv[1]) print "config:" for item in config.items('config'): print "\t%s" % (str(item)) # var inbredsetid = config.get('config', 'inbredsetid') print "inbredsetid: %s" % inbredsetid species = datastructure.get_species(inbredsetid) speciesid = species[0] print "speciesid: %s" % speciesid # datafile datafile = open(config.get('config', 'datafile'), 'r') data = csv.reader(datafile, delimiter ="\t", quotechar='"') strainnames = data.next() print "strainnames: %s\n\t%s" % (len(strainnames), strainnames) strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strainnames=strainnames, updatestrainxref="yes") print("strain: %s\n\t%s" % (len(strains), strains)) datafile.close()
def parse_genofile(params): # genofile with open(params['genofile']) as genofile: meta_data = {} print() # parse genofile for line in genofile: line = line.strip() if not line: pass elif line.startswith('#'): pass elif line.startswith('@'): line = line.strip('@') for item in line.split(';'): kv = re.split(':|=', item) meta_data[kv[0].strip()] = kv[1].strip() elif line.lower().startswith("chr"): print("geno file meta:") for key, value in meta_data.iteritems(): print("\t{}: {}".format(key, value)) print("geno file head:\n\t{}\n".format(line)) strain_names = line.split()[4:] strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strain_names=strain_names, updatestrainxref="yes") else: # geno file line marker = Marker(line) # geno_id = check_or_insert_geno(params, marker) if check_genoxref(params): #Check if this can go earlier continue insert_genodata(params) insert_genoxref(params) data_id += 1
def main(argv): # config config = utilities.get_config(argv[1]) print("config:") for item in config.items('config'): print(("\t%s" % (str(item)))) # var inbredsetid = config.get('config', 'inbredsetid') print(("inbredsetid: %s" % inbredsetid)) species = datastructure.get_species(inbredsetid) speciesid = species[0] print(("speciesid: %s" % speciesid)) dataid = datastructure.get_nextdataid_phenotype() print(("next data id: %s" % dataid)) cursor, con = utilities.get_cursor() # datafile datafile = open(config.get('config', 'datafile'), 'r') phenotypedata = csv.reader(datafile, delimiter='\t', quotechar='"') phenotypedata_head = next(phenotypedata) print(("phenotypedata head:\n\t%s" % phenotypedata_head)) strainnames = phenotypedata_head[1:] strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strainnames=strainnames, updatestrainxref="yes") # metafile metafile = open(config.get('config', 'metafile'), 'r') phenotypemeta = csv.reader(metafile, delimiter='\t', quotechar='"') phenotypemeta_head = next(phenotypemeta) print(("phenotypemeta head:\n\t%s" % phenotypemeta_head)) print() # load for metarow in phenotypemeta: # datarow_value = next(phenotypedata) datarow_se = next(phenotypedata) datarow_n = next(phenotypedata) # Phenotype sql = """ INSERT INTO Phenotype SET Phenotype.`Pre_publication_description`=%s, Phenotype.`Post_publication_description`=%s, Phenotype.`Original_description`=%s, Phenotype.`Pre_publication_abbreviation`=%s, Phenotype.`Post_publication_abbreviation`=%s, Phenotype.`Lab_code`=%s, Phenotype.`Submitter`=%s, Phenotype.`Owner`=%s, Phenotype.`Authorized_Users`=%s, Phenotype.`Units`=%s """ cursor.execute(sql, ( utilities.to_db_string(metarow[1], None), utilities.to_db_string(metarow[2], None), utilities.to_db_string(metarow[3], None), utilities.to_db_string(metarow[4], None), utilities.to_db_string(metarow[5], None), utilities.to_db_string(metarow[6], None), utilities.to_db_string(metarow[7], None), utilities.to_db_string(metarow[8], None), utilities.to_db_string(metarow[9], ""), utilities.to_db_string(metarow[18], ""), )) rowcount = cursor.rowcount phenotypeid = con.insert_id() print(("INSERT INTO Phenotype: %d record: %d" % (rowcount, phenotypeid))) # Publication publicationid = None # reset pubmed_id = utilities.to_db_string(metarow[0], None) if pubmed_id: sql = """ SELECT Publication.`Id` FROM Publication WHERE Publication.`PubMed_ID`=%s """ cursor.execute(sql, (pubmed_id)) re = cursor.fetchone() if re: publicationid = re[0] print(("get Publication record: %d" % publicationid)) if not publicationid: sql = """ INSERT INTO Publication SET Publication.`PubMed_ID`=%s, Publication.`Abstract`=%s, Publication.`Authors`=%s, Publication.`Title`=%s, Publication.`Journal`=%s, Publication.`Volume`=%s, Publication.`Pages`=%s, Publication.`Month`=%s, Publication.`Year`=%s """ cursor.execute(sql, ( utilities.to_db_string(metarow[0], None), utilities.to_db_string(metarow[12], None), utilities.to_db_string(metarow[10], ""), utilities.to_db_string(metarow[11], None), utilities.to_db_string(metarow[13], None), utilities.to_db_string(metarow[14], None), utilities.to_db_string(metarow[15], None), utilities.to_db_string(metarow[16], None), utilities.to_db_string(metarow[17], ""), )) rowcount = cursor.rowcount publicationid = con.insert_id() print(("INSERT INTO Publication: %d record: %d" % (rowcount, publicationid))) # data for index, strain in enumerate(strains): # strainid = strain[0] value = utilities.to_db_float(datarow_value[index+1], None) se = utilities.to_db_float(datarow_se[index+1], None) n = utilities.to_db_int(datarow_n[index+1], None) # if value is not None: sql = """ INSERT INTO PublishData SET PublishData.`Id`=%s, PublishData.`StrainId`=%s, PublishData.`value`=%s """ cursor.execute(sql, (dataid, strainid, value)) if se is not None: sql = """ INSERT INTO PublishSE SET PublishSE.`DataId`=%s, PublishSE.`StrainId`=%s, PublishSE.`error`=%s """ cursor.execute(sql, (dataid, strainid, se)) if n is not None: sql = """ INSERT INTO NStrain SET NStrain.`DataId`=%s, NStrain.`StrainId`=%s, NStrain.`count`=%s """ cursor.execute(sql, (dataid, strainid, n)) # PublishXRef sql = """ INSERT INTO PublishXRef SET PublishXRef.`InbredSetId`=%s, PublishXRef.`PhenotypeId`=%s, PublishXRef.`PublicationId`=%s, PublishXRef.`DataId`=%s, PublishXRef.`comments`=%s """ cursor.execute(sql, (inbredsetid, phenotypeid, publicationid, dataid, "")) rowcount = cursor.rowcount publishxrefid = con.insert_id() print(("INSERT INTO PublishXRef: %d record: %d" % (rowcount, publishxrefid))) # for loop next dataid += 1 print() # release con.close()
def main(argv): # config config = utilities.get_config(argv[1]) print "config:" for item in config.items('config'): print "\t%s" % (str(item)) # var inbredsetid = config.get('config', 'inbredsetid') print "inbredsetid: %s" % inbredsetid species = datastructure.get_species(inbredsetid) speciesid = species[0] print "speciesid: %s" % speciesid dataid = datastructure.get_nextdataid_phenotype() print "next data id: %s" % dataid cursor, con = utilities.get_cursor() # datafile datafile = open(config.get('config', 'datafile'), 'r') phenotypedata = csv.reader(datafile, delimiter='\t', quotechar='"') phenotypedata_head = phenotypedata.next() print("phenotypedata head [%d]:\n\t%s" % (len(phenotypedata_head), phenotypedata_head)) strainnames = phenotypedata_head[1:] strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strainnames=strainnames, updatestrainxref="yes") # metafile metafile = open(config.get('config', 'metafile'), 'r') phenotypemeta = csv.reader(metafile, delimiter='\t', quotechar='"') phenotypemeta_head = phenotypemeta.next() print("phenotypemeta head [%d]:\n\t%s" % (len(phenotypemeta_head), phenotypemeta_head)) print # load for metarow in phenotypemeta: # datarow_value = phenotypedata.next() datarow_se = phenotypedata.next() datarow_n = phenotypedata.next() # Phenotype sql = """ INSERT INTO Phenotype SET Phenotype.`Pre_publication_description`=%s, Phenotype.`Post_publication_description`=%s, Phenotype.`Original_description`=%s, Phenotype.`Pre_publication_abbreviation`=%s, Phenotype.`Post_publication_abbreviation`=%s, Phenotype.`Lab_code`=%s, Phenotype.`Submitter`=%s, Phenotype.`Owner`=%s, Phenotype.`Authorized_Users`=%s, Phenotype.`Units`=%s """ cursor.execute(sql, ( utilities.to_db_string(metarow[1], None), utilities.to_db_string(metarow[2], None), utilities.to_db_string(metarow[3], None), utilities.to_db_string(metarow[4], None), utilities.to_db_string(metarow[5], None), utilities.to_db_string(metarow[6], None), utilities.to_db_string(metarow[7], None), utilities.to_db_string(metarow[8], None), utilities.to_db_string(metarow[9], ""), utilities.to_db_string(metarow[18], ""), )) rowcount = cursor.rowcount phenotypeid = con.insert_id() print "INSERT INTO Phenotype: %d record: %d" % (rowcount, phenotypeid) # Publication publicationid = None # reset pubmed_id = utilities.to_db_string(metarow[0], None) if pubmed_id: sql = """ SELECT Publication.`Id` FROM Publication WHERE Publication.`PubMed_ID`=%s """ cursor.execute(sql, (pubmed_id)) re = cursor.fetchone() if re: publicationid = re[0] print "get Publication record: %d" % publicationid if not publicationid: sql = """ INSERT INTO Publication SET Publication.`PubMed_ID`=%s, Publication.`Abstract`=%s, Publication.`Authors`=%s, Publication.`Title`=%s, Publication.`Journal`=%s, Publication.`Volume`=%s, Publication.`Pages`=%s, Publication.`Month`=%s, Publication.`Year`=%s """ cursor.execute(sql, ( utilities.to_db_string(metarow[0], None), utilities.to_db_string(metarow[12], None), utilities.to_db_string(metarow[10], ""), utilities.to_db_string(metarow[11], None), utilities.to_db_string(metarow[13], None), utilities.to_db_string(metarow[14], None), utilities.to_db_string(metarow[15], None), utilities.to_db_string(metarow[16], None), utilities.to_db_string(metarow[17], ""), )) rowcount = cursor.rowcount publicationid = con.insert_id() print "INSERT INTO Publication: %d record: %d" % (rowcount, publicationid) # data for index, strain in enumerate(strains): # strainid = strain[0] value = utilities.to_db_float(datarow_value[index+1], None) se = utilities.to_db_float(datarow_se[index+1], None) n = utilities.to_db_int(datarow_n[index+1], None) # if value is not None: sql = """ INSERT INTO PublishData SET PublishData.`Id`=%s, PublishData.`StrainId`=%s, PublishData.`value`=%s """ cursor.execute(sql, (dataid, strainid, value)) if se is not None: sql = """ INSERT INTO PublishSE SET PublishSE.`DataId`=%s, PublishSE.`StrainId`=%s, PublishSE.`error`=%s """ cursor.execute(sql, (dataid, strainid, se)) if n is not None: sql = """ INSERT INTO NStrain SET NStrain.`DataId`=%s, NStrain.`StrainId`=%s, NStrain.`count`=%s """ cursor.execute(sql, (dataid, strainid, n)) # PublishXRef sql = """ INSERT INTO PublishXRef SET PublishXRef.`InbredSetId`=%s, PublishXRef.`PhenotypeId`=%s, PublishXRef.`PublicationId`=%s, PublishXRef.`DataId`=%s, PublishXRef.`comments`=%s """ cursor.execute(sql, (inbredsetid, phenotypeid, publicationid, dataid, "")) rowcount = cursor.rowcount publishxrefid = con.insert_id() print "INSERT INTO PublishXRef: %d record: %d" % (rowcount, publishxrefid) # for loop next dataid += 1 print # release con.close()