def _first_nonempty_token(tokens):
    """Return the first token that is not the empty string, or None."""
    for token in tokens:
        if token != '':
            return token
    return None


def _parse_gene_record(gene_file):
    """
    Collect the fields of one flat-file gene record into a dict.

    The record is read as "KEYWORD value..." header lines, continuation
    lines that start with a space, and lines starting with '/' that are
    ignored (presumably the KEGG GENES flat-file layout -- confirm against
    the caller).

    @param gene_file: object providing readlines()
    @return: mapping from field keyword to collected text, plus the derived
        keys 'NTSEQLEN', 'ORGANISM_SHORT' and 'ORGANISM'
    @rtype: dict
    @raise ValueError: if the first value on the NTSEQ header line is not
        an integer
    """
    current_info_type = ''
    record = dict()
    # readlines() is kept (rather than iterating the object) so that the
    # only thing required of gene_file stays the same as before.
    for raw_line in gene_file.readlines():
        line = raw_line.rstrip('\r\n')
        if line.startswith('/'):
            continue

        tokens = line.split(' ')
        is_header = not line.startswith(' ')
        if is_header:
            current_info_type = tokens[0]
            tokens = tokens[1:]

            # Keyword-specific values live on the header line only; running
            # this on continuation lines would feed sequence text to int().
            if current_info_type == 'ENTRY':
                entry_id = _first_nonempty_token(tokens)
                if entry_id is not None:
                    record['ENTRY'] = entry_id
            elif current_info_type == 'NTSEQ':
                seq_length = _first_nonempty_token(tokens)
                if seq_length is not None:
                    record['NTSEQLEN'] = int(seq_length)
            elif current_info_type == 'ORGANISM':
                # Split on runs of whitespace so column padding cannot
                # produce empty fields, and tolerate a missing long name.
                organism_fields = line[len('ORGANISM'):].split(None, 1)
                record['ORGANISM_SHORT'] = (
                    organism_fields[0] if organism_fields else '')
                record['ORGANISM'] = (
                    organism_fields[1] if len(organism_fields) > 1 else '')

        if current_info_type in ('ENTRY', 'ORGANISM'):
            continue
        if line.startswith('NTSEQ'):
            # The NTSEQ header only carries the length, handled above.
            continue
        if tokens:
            # Tokens are concatenated with no separator (existing behaviour
            # kept), so a multi-word value is stored without its spaces.
            record[current_info_type] = (
                record.get(current_info_type, '') + ''.join(tokens))
    return record


def saveGeneDataToDB(gene_file):
    """
    Parse one gene record from gene_file and save it as a gene object.

    The NAME field is split on ';': the first part becomes the name and
    every remaining part is appended, each followed by '_', to the
    nicknames string.  A missing field or a failing save is reported with
    a traceback on stderr instead of being raised.

    @param gene_file: object providing readlines() with the record text
    @return: None
    @raise ValueError: if the NTSEQ header line does not carry an integer
        length (raised while parsing, before anything is saved)
    """
    compound_info = _parse_gene_record(gene_file)
    try:
        names = compound_info['NAME'].split(';')
        compound_info['NAME'] = names[0]
        compound_info['NICKNAME'] = ''.join(
            nickname + '_' for nickname in names[1:])

        new_gene = gene(gene_id=compound_info['ENTRY'])
        new_gene.name = compound_info['NAME']
        new_gene.nicknames = compound_info['NICKNAME']
        new_gene.definition = compound_info['DEFINITION']
        new_gene.organism_short = compound_info['ORGANISM_SHORT']
        new_gene.organism = compound_info['ORGANISM']
        new_gene.position = compound_info['POSITION']
        new_gene.ntseq_length = compound_info['NTSEQLEN']
        new_gene.ntseq = compound_info['NTSEQ']
        try:
            new_gene.save()
        except Exception:
            traceback.print_exc()
            # Single-argument form prints the same text on Python 2 and 3.
            print('%s can not be saved' % compound_info['NAME'])
    except Exception:
        # Best effort: an incomplete record (missing key) is logged, not
        # raised.  KeyboardInterrupt/SystemExit are no longer swallowed.
        traceback.print_exc()
def get_or_create_gene(self, gid):
    """
    Find a gene in the database; if it is not found, fetch it from NCBI.

    The database is queried with gene.objects.get(gene_id=gid).  When that
    fails, the FASTA text for gid is downloaded from NCBI efetch (nuccore),
    parsed with parse_fasta_str, and the first record that has detail
    information and can be saved is stored and returned.

    @param gid: gene id
    @type gid: str
    @return: gene object, or None when no record could be created
    @rtype: gene
    @raise urllib2.URLError: if the NCBI request fails (not caught here)
    """
    # Local import so this method does not depend on the module importing it.
    import traceback

    # get in database
    try:
        return gene.objects.get(gene_id=gid)
    except Exception:
        # Any lookup failure falls through to the NCBI fetch below.
        pass

    # get from ncbi
    # NOTE(review): plain-http endpoint kept as-is; NCBI may expect https
    # for E-utilities -- confirm before changing.
    base_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id='
    response = urllib2.urlopen(urllib2.Request(base_url + gid))
    try:
        fasta_text = response.read()
    finally:
        # Always release the connection, even if read() fails.
        response.close()

    gene_dict = parse_fasta_str(fasta_text)
    for header in gene_dict.keys():
        # The record id is taken from the second '|'-separated header field.
        header_fields = header.split('|')
        if len(header_fields) < 2:
            # No such field: skip the record instead of raising IndexError.
            continue
        record_id = header_fields[1]

        # get detail information
        detail_info = self.retrive_gene_detain(record_id)
        if detail_info is None:
            continue

        sequence = gene_dict[header]
        new_gene_obj = gene(gene_id=record_id)
        new_gene_obj.name = detail_info['name']
        new_gene_obj.definition = detail_info['definition']
        new_gene_obj.organism = detail_info['organism']
        new_gene_obj.ntseq = sequence
        new_gene_obj.ntseq_length = len(sequence)
        try:
            new_gene_obj.save()
        except Exception:
            # Still best effort, but report the failure and try the next
            # record instead of hiding it.
            traceback.print_exc()
            continue
        return new_gene_obj
    return None