def var_parser(outputfile, emu_out):
    """Yield per-article mutation entries, merging in results from `emu_out`.

    `outputfile` is read as blank-line-separated article blocks. Inside a
    block, a line of the form ``<digits>|a|<text>`` supplies the article id
    and text; every other line must have exactly six tab-separated fields
    (id, start, end, mention string, type, normalized form).

    `emu_out` is passed to `emu_to_tmvar`, which is expected to produce
    ``(pmid, string, norm)`` triples. For each article, a triple whose
    normalized form neither contains nor is contained in the ``id`` of an
    entry already collected is added once per position returned by
    `__emu_get_pos(string, text)`.

    Mentions whose normalization raises ``ValueError`` are reported on
    stdout and skipped.

    Yields:
        (pmid, text, mutation_entry_ls) for every article block; `pmid` is
        ``False`` and `text` is ``''`` when the block has no text line.

    Raises:
        ValueError: if a non-text line does not have six tab-separated
            fields, or its start/end fields are not integers.
    """
    with open(outputfile, 'r') as f:
        mutation_results = [i.split('\n') for i in f.read().split('\n\n') if i]

    # pmid -> {mention string -> normalized form}
    emu = defaultdict(dict)
    for pmid, string, norm in emu_to_tmvar(emu_out):
        emu[int(pmid)][string] = norm

    for article_result in mutation_results:
        text = ''
        pmid = False
        mutation_entry_ls = []

        for record in article_result:
            is_text = re.match(r'(\d+)\|a\|(.+)$', record)
            if is_text:
                # text line: article id and article text
                text = is_text.group(2)
                pmid = int(is_text.group(1))
                continue

            _rpmid, start, end, string, _mtype, norm = record.split('\t')
            start = int(start)
            end = int(end)
            # A mention shaped like <letter><digits><letter> (letters A-T)
            # whose normalized form starts with 'c' has that leading 'c'
            # rewritten to 'p'.
            if (re.search(r'[A-T]\d+[A-T]', string.upper())
                    and norm.startswith('c')):
                norm = norm.replace('c', 'p', 1)
            try:
                my_nm = normalization.mutation_string_normalizition(
                    string, norm)
            except ValueError as e:
                print('{0}\t{1:10}'.format(e, 'skip'))
                continue
            mutation_entry_ls.append(
                Base.BioEntry(start, end, 'mutation', string, norm, my_nm))

        # .get() avoids inserting an empty dict for articles without results.
        for string, emu_nm in emu.get(pmid, {}).items():
            # Duplicate when either normalized form contains the other.
            # Entries added below for earlier strings are checked as well.
            is_exist = any(
                emu_nm in tmvar_mut.id or tmvar_mut.id in emu_nm
                for tmvar_mut in mutation_entry_ls)
            if is_exist:
                continue

            # add one mutation entry per occurrence position
            for start, end in __emu_get_pos(string, text):
                try:
                    my_nm = normalization.mutation_string_normalizition(
                        string, emu_nm)
                except ValueError as e:
                    # Same handling as for the file records above; the
                    # arguments do not change per position, so stop here.
                    print('{0}\t{1:10}'.format(e, 'skip'))
                    break
                mutation_entry_ls.append(
                    Base.BioEntry(start, end, 'mutation', string, emu_nm,
                                  my_nm))

        yield (pmid, text, mutation_entry_ls)
def __GNormPlus_parser_old(outputdir):
    """Yield gene/protein entries from every file under `outputdir`.

    Each file is read line by line (empty lines ignored). Lines of the form
    ``<digits>|a|<text>`` are skipped; every other line must have exactly
    six tab-separated fields (pmid, start, end, mention string, type, ids).
    Only lines whose type starts with ``gene`` or ``protein``
    (case-insensitive) are kept.

    The ids field may hold several ``;``-separated ids (one mention can map
    to multiple genes); one entry is yielded per id. Ids for which
    ``gnor.norm`` raises ``KeyError`` are reported on stdout and skipped.

    Yields:
        (pmid, gene_entry, fn) where `fn` is the name of the file the
        record came from.

    Raises:
        ValueError: if an annotation line does not have six tab-separated
            fields, or pmid/start/end are not integers.
    """
    gnor = normalization.gene_normor('../data/hgnc_complete_set.txt')
    for root, _dirs, files in os.walk(outputdir):
        for fn in files:
            with open(os.path.join(root, fn), 'r') as f:
                records = [i for i in f.read().split('\n') if i]
            for r in records:
                if re.match(r'\d+\|a\|(.+)$', r):
                    # article text line, carries no annotation
                    continue
                pmid, start, end, string, _type, ids = r.split('\t')
                if not re.match('gene|protein', _type, re.IGNORECASE):
                    continue
                pmid = int(pmid)
                start = int(start)
                end = int(end)
                for did in ids.split(';'):
                    # one string can map to multiple genes
                    try:
                        norm = gnor.norm(did)
                    except KeyError:
                        print('gene {0} skip: no gene in hgnc'.format(
                            string))
                        continue
                    # Pass the current gene id; the original passed the
                    # builtin function `id` here by mistake.
                    yield (pmid,
                           Base.BioEntry(start, end, _type, string, did,
                                         norm),
                           fn)
def GNormPlus_parser(outputfile):
    """Yield gene/protein entries from a single result file.

    `outputfile` is read as blank-line-separated article blocks. Lines of
    the form ``<digits>|a|<text>`` are skipped; every other line must have
    exactly six tab-separated fields (pmid, start, end, mention string,
    type, ids). Only lines whose type starts with ``gene`` or ``protein``
    (case-insensitive) are kept.

    The ids field may hold several ``;``-separated ids (one mention can map
    to multiple genes); one entry is yielded per id. Ids for which
    ``gnor.norm`` raises ``KeyError`` are reported on stdout and skipped.

    NOTE(review): another ``GNormPlus_parser`` (taking a directory) is
    defined later in this module; as long as both exist, the later
    definition replaces this one when the module is loaded.

    Yields:
        (pmid, gene_entry)

    Raises:
        ValueError: if an annotation line does not have six tab-separated
            fields, or pmid/start/end are not integers.
    """
    gnor = normalization.gene_normor('../data/hgnc_complete_set.txt')
    with open(outputfile, 'r') as f:
        gene_results = [i.split('\n') for i in f.read().split('\n\n') if i]

    for article_result in gene_results:
        for record in article_result:
            if re.match(r'(\d+)\|a\|(.+)$', record):
                # article text line, carries no annotation
                continue
            pmid, start, end, string, _type, ids = record.split('\t')
            if not re.match('gene|protein', _type, re.IGNORECASE):
                continue
            pmid = int(pmid)
            start = int(start)
            end = int(end)
            for did in ids.split(';'):
                # one string can map to multiple genes
                try:
                    norm = gnor.norm(did)
                except KeyError:
                    print('gene {0} skip: no gene in hgnc'.format(string))
                    continue
                # Pass the current gene id; the original passed the builtin
                # function `id` here by mistake.
                yield (pmid,
                       Base.BioEntry(start, end, _type, string, did, norm))
def DNorm_parser(outputfile):
    """Yield ``(pmid, disease_entry)`` pairs read from `outputfile`.

    The file is treated as newline-separated records of tab-separated
    fields. Empty lines and records that do not have exactly five fields
    (pmid, start, end, mention string, id) are ignored.

    Raises:
        ValueError: if pmid, start or end of a five-field record is not an
            integer.
    """
    with open(outputfile, 'r') as handle:
        raw_lines = handle.read().split('\n')

    for line in raw_lines:
        if not line:
            continue
        fields = line.split('\t')
        if len(fields) != 5:
            # records without an id column are dropped
            continue
        article_id, begin, stop, mention, concept_id = fields
        article_id = int(article_id)
        begin = int(begin)
        stop = int(stop)
        entry = Base.BioEntry(begin, stop, 'disease', mention, concept_id)
        yield (article_id, entry)
def GNormPlus_parser(outputdir):
    """Yield ``(pmid, gene_entry, file_name)`` for files under `outputdir`.

    Every file found by walking `outputdir` is read as newline-separated
    records (empty lines ignored). Lines of the form ``<digits>|a|<text>``
    are skipped; every other line must have exactly six tab-separated
    fields (pmid, start, end, mention string, type, id). Only records whose
    type starts with ``gene`` or ``protein`` (case-insensitive) are
    yielded.

    Raises:
        ValueError: if an annotation line does not have six tab-separated
            fields, or pmid/start/end are not integers.
    """
    for folder, _subdirs, filenames in os.walk(outputdir):
        for filename in filenames:
            with open(os.path.join(folder, filename), 'r') as handle:
                lines = handle.read().split('\n')

            for line in lines:
                if not line:
                    continue
                if re.match(r'\d+\|a\|(.+)$', line):
                    # article text line, carries no annotation
                    continue

                (article_id, begin, stop,
                 mention, entity_type, entity_id) = line.split('\t')
                if re.match('gene|protein', entity_type,
                            re.IGNORECASE) is None:
                    continue

                article_id = int(article_id)
                begin = int(begin)
                stop = int(stop)
                entry = Base.BioEntry(begin, stop, entity_type, mention,
                                      entity_id)
                yield (article_id, entry, filename)
def tmvar_parser(outputfile):
    """Yield ``(pmid, text, mutation_entries)`` for each article block.

    `outputfile` is read as blank-line-separated article blocks. Inside a
    block, a line of the form ``<digits>|a|<text>`` supplies the article id
    and text; every other line must have exactly six tab-separated fields
    (id, start, end, mention string, type, normalized form) and becomes one
    'mutation' entry.

    When a block has no text line, `pmid` is ``False`` and `text` is ``''``.

    Raises:
        ValueError: if a non-text line does not have six tab-separated
            fields, or its start/end fields are not integers.
    """
    with open(outputfile, 'r') as handle:
        content = handle.read()

    text_line = re.compile(r'(\d+)\|a\|(.+)$')

    for block in content.split('\n\n'):
        if not block:
            continue

        abstract = ''
        article_id = False
        mutations = []

        for line in block.split('\n'):
            header = text_line.match(line)
            if header:
                # text line: article id and article text
                abstract = header.group(2)
                article_id = int(header.group(1))
                continue

            _, begin, stop, mention, _, normalized = line.split('\t')
            begin = int(begin)
            stop = int(stop)
            mutations.append(
                Base.BioEntry(begin, stop, 'mutation', mention, normalized))

        yield (article_id, abstract, mutations)