def all_files(dataset, mongo):
    """Run the CHF rule code over every card of *dataset* and store results.

    For each card index returned by ``get("all_indexes", ...)`` the card is
    converted with ``create_dict``, processed with ``all_steps`` and its
    ``doc_data['data']`` is stored under "annotations".  Cards whose data
    contains the key 'ICHF' are collected and stored under
    "calculated_indexes".

    dataset -- dataset name forwarded to every ``get``/``put`` call.
    mongo   -- storage handle forwarded to every ``get``/``put`` call.
    """
    annotated_cards = []
    for number_of_card in get("all_indexes", dataset=dataset, mongo=mongo):
        print('Card #' + number_of_card)
        doc_data = create_dict(dataset, number_of_card, mongo)
        # The rule code is fetched again for every card.
        code = get("code.json", formula="CHF", mongo=mongo)
        all_steps(code, json_interpretator, doc_data, dataset,
                  number_of_card, mongo)
        put("annotations", doc_data['data'],
            dataset=dataset,
            number_of_card=number_of_card,
            formula='CHF',
            mongo=mongo)
        if 'ICHF' in doc_data['data']:
            annotated_cards.append(number_of_card)

    put("calculated_indexes", annotated_cards,
        formula='CHF', dataset=dataset, mongo=mongo)
    print(str(len(annotated_cards)) + ' documents were annotated by ICHF.')
def __init__(self, args, mongo, httpd):
    """Run a formula step by step for one card, snapshotting after each step.

    args  -- mapping whose 'args' entry is a URL-quoted JSON object; the keys
             read here are 'formula', 'ds' and 'id'.
    mongo -- storage handle forwarded to ``get`` and ``next_step``.
    httpd -- server object; only ``httpd.mLock`` is used.

    Steps are executed with ``next_step`` until it returns None.  After every
    successful step the whole list of intermediate results is rewritten to
    ``snap_file_name`` as indented JSON.  ``self.site`` is set to the
    URL-quoted JSON string "ready".
    """
    req = json.loads(urllib.unquote(args['args']))
    print('req' + str(req))

    lock = httpd.mLock
    lock.acquire()
    try:
        code = get("code.cla.json", formula=req['formula'], mongo=mongo)
    finally:
        # Always release, otherwise a failed lookup would block every
        # later request that uses the same lock.
        lock.release()
    print('runCode version: ' + code['version'])

    doc = []
    doc_data = {}
    n = 0
    while True:
        print('Step: ' + str(n))
        doc_data = next_step(doc_data, code, req['ds'], req['id'], n, mongo)
        if doc_data is None:
            break
        doc.append(doc_data)
        # Save to file; ``with`` closes the handle even if the dump fails.
        with open(snap_file_name, 'w') as snap_file:
            snap_file.write(json.dumps(doc, indent=4))
        n += 1

    answer = 'ready'
    self.site = urllib.quote(json.dumps(answer, indent=4))
def generator_of_chunks(self, text, mongo, lock):
    """Return the serialized ``<p>`` nodes of a document, or None on failure.

    text  -- either 'Doc #<card number>' (the stored "doc.html" of that card
             in dataset 'cci' is used) or raw text, which is passed to
             ``self.split_to_chunks``.
    mongo -- storage handle forwarded to ``get``.
    lock  -- lock held while reading from storage; also forwarded to
             ``self.split_to_chunks``.

    The document is written to a temporary file whose path depends on the
    host name, parsed back with an lxml HTML parser (comments removed), and
    every ``/html/body/p`` node is returned as ``etree.tostring`` output.
    Returns None when the stored card is missing or the temporary file
    cannot be read.
    """
    if text[:5] == 'Doc #':
        number_of_card = text[5:]
        lock.acquire()
        try:
            nodes = get("doc.html", number_of_card=number_of_card,
                        dataset='cci', mongo=mongo)
        finally:
            # Release even when the lookup raises.
            lock.release()
        if nodes is None:
            return None
        doc = '\n'.join(nodes)
    else:
        doc = self.split_to_chunks(text, lock)

    if socket.gethostname() == 'noX540LJ':
        tmp_file = 'tmp/formula.cla'
    else:
        tmp_file = '/home/andrey/work/Claudia/claudia/tmp/formula.cla'

    with open(tmp_file, 'w') as out:
        out.write(doc)

    try:
        with open(tmp_file, 'rb') as inp:
            html_parser = etree.HTMLParser(remove_comments=True)
            tree = etree.parse(inp, html_parser)
            nodes = tree.xpath('/html/body/p')
    except IOError:
        print('No such file or directory: ' + tmp_file)
        return None

    return [etree.tostring(node) for node in nodes]
def create_dict(dataset, number_of_card, mongo):
    """Build the annotation skeleton of one card from its stored "doc.html".

    The result has the shape::

        {'data': {},
         'sentences': [{'data': {},
                        'chunks': [{'text': ..., 'data': {'__negation': ...}},
                                   ...]},
                       ...]}

    One sentence is created per stored node.  Chunks are the
    ``/span/span/span`` elements found below each ``/p/span/span`` element;
    '__negation' is the 'class' attribute of that ``/p/span/span`` element
    with its first six characters removed.
    """
    doc_data = {'data': {}, 'sentences': []}
    samples = get("doc.html", dataset=dataset,
                  number_of_card=number_of_card, mongo=mongo)

    for node in samples:
        # Serialize and re-parse so the element is the root of its own tree
        # and the absolute XPath below starts at it.
        paragraph = etree.fromstring(etree.tostring(etree.fromstring(node)))
        chunks = []
        for level_node in paragraph.xpath('/p/span/span'):
            level_attrs = level_node.attrib
            # Same round trip: make the level element a document root.
            level_root = etree.fromstring(etree.tostring(level_node))
            for leaf in level_root.xpath('/span/span/span'):
                chunks.append({
                    'text': leaf.text,
                    'data': {'__negation': level_attrs['class'][6:]},
                })
        doc_data['sentences'].append({'data': {}, 'chunks': chunks})

    return doc_data
def taxonomy(text, tax, mongo):
    """Annotate *text* with the entries of taxonomy *tax* that occur in it.

    The taxonomy is read with ``get("tax.tset", taxonomy=tax, ...)`` and
    split into lines.  Only lines starting with a double quote are used; each
    is split on '"', so ``words[1]`` is the quoted term and the following
    quoted fields are ``words[3]``, ``words[5]``, ...

    When ``is_word`` finds the whitespace-normalised term in the
    whitespace-normalised text, ``result[tax]`` is set to *text* and the
    remaining quoted fields are stored as alternating key/value pairs.

    Returns a dict (empty when nothing matched).
    """
    whitespace = re.compile(r"\s+", re.M | re.I | re.U)
    result = {}
    tax_lines = get("tax.tset", taxonomy=tax, mongo=mongo).split('\n')
    # The text does not change between lines, so normalise it once.
    normalized_text = whitespace.sub(' ', text)

    for line in tax_lines:
        if not line.startswith('"'):
            continue
        words = line.split('"')
        term = whitespace.sub(' ', words[1])
        if not is_word(term, normalized_text):
            continue

        result[tax] = text
        # Splitting on '"' puts quoted fields at odd indexes; the even ones
        # are the separators between quotes and are skipped.
        key = ''
        for word in words[3::2]:
            value = whitespace.sub(' ', word)
            if key == '':
                # An empty field never becomes a key: the next field is
                # treated as the key instead.
                key = value
            else:
                result[key] = value
                key = ''
    return result
def all_files(dataset, formula, mongo):
    """Apply *formula* to every card of *dataset* and store the results.

    For each card the annotations (``doc_data['data']``) are stored under
    "annotations".  Cards whose 'Formula diagnose' is not 'No' are collected
    into "calculated_indexes".  Cards are also grouped by their 'value'
    annotation into "results_apostriory": a card goes to the group named by
    its value when that value is one of ``apostriory``, otherwise to the
    'not mentioned' group.

    dataset -- dataset name forwarded to ``get``/``put``.
    formula -- formula name used to select the rule code and to tag results.
    mongo   -- storage handle forwarded to ``get``/``put``.
    """
    positive_cards = []
    apost_res = {diag: [] for diag in apostriory}

    indexes = get("all_indexes", dataset=dataset, mongo=mongo)
    code = get("code.cla.json", formula=formula, mongo=mongo)

    for number_of_card in indexes:
        print('Card #' + number_of_card)
        doc_data = create_dict(dataset, number_of_card, mongo)
        all_steps(code, doc_data, dataset, number_of_card, mongo, False)
        data = doc_data['data']
        put("annotations", data, dataset=dataset,
            number_of_card=number_of_card, formula=formula, mongo=mongo)

        if data['Formula diagnose'] != 'No':
            positive_cards.append(number_of_card)
        print(data['Formula diagnose'])

        if 'value' in data and data['value'] in apostriory:
            apost_res[data['value']].append(number_of_card)
        else:
            # setdefault keeps this working even if 'not mentioned' is not
            # one of the labels listed in ``apostriory``.
            apost_res.setdefault('not mentioned', []).append(number_of_card)

    put("calculated_indexes", positive_cards,
        formula=formula, dataset=dataset, mongo=mongo)
    put("results_apostriory", apost_res,
        formula=formula, dataset=dataset, mongo=mongo)
    print('Apostriory: ' + json.dumps(apost_res, indent=4))
def next_step(code, dataset, number_of_card, step_id, mongo):
    """Execute the statement with id *step_id* on the card's stored snapshot.

    On step 0 a fresh document is built with ``create_dict``, passed through
    ``INA`` and stored with ``snapshot``.  The working document is then
    loaded from "snap.json" (or rebuilt with ``create_dict`` when no snapshot
    exists), the first statement in ``code['statements']`` whose
    'statementId' equals *step_id* is applied with ``json_interpretator``,
    and the result is stored with ``snapshot`` again.

    Nothing is returned; the result lives in the stored snapshot.
    """
    if step_id == 0:
        initial = INA(create_dict(dataset, number_of_card, mongo), mongo)
        snapshot(dataset, number_of_card, initial, mongo)

    doc_data = get("snap.json", dataset=dataset,
                   number_of_card=number_of_card, mongo=mongo)
    if doc_data is None:
        doc_data = create_dict(dataset, number_of_card, mongo)

    # Lazily look for the first matching statement; later ones are ignored.
    statement = next(
        (candidate for candidate in code['statements']
         if 'statementId' in candidate
         and candidate['statementId'] == step_id),
        None)
    if statement is not None:
        doc_data = json_interpretator(doc_data, dataset, number_of_card,
                                      statement, mongo)

    snapshot(dataset, number_of_card, doc_data, mongo)
def __init__(self, args, mongo, httpd):
    """Advance a formula run up to a requested step and return the new frames.

    args  -- mapping whose 'args' entry is a URL-quoted JSON object; the keys
             read here are 'ticket', 'formula', 'ds', 'id', 'new_step' and
             'step'.
    mongo -- storage handle forwarded to ``get`` and ``next_step``.
    httpd -- server object providing ``mLock`` and, for non-admin tickets,
             the cache ``cch``.

    The list of already computed step results is loaded from
    ``snap_file_name`` (ticket 'admin') or from the cache (any other
    ticket).  Missing steps up to ``req['new_step']`` are computed with
    ``next_step`` and appended as deep copies.  The extended list is saved
    back to the place it came from, and ``self.site`` is set to the
    URL-quoted JSON of the entries after index ``req['step']``.
    """
    req = json.loads(urllib.unquote(args['args']))
    print('req: ' + str(req))

    lock = httpd.mLock
    is_admin = req['ticket'] == 'admin'
    cch = None

    # Every locked section uses try/finally so that an exception cannot
    # leave the shared lock held.
    lock.acquire()
    try:
        if is_admin:
            with open(snap_file_name, 'r') as snap_file:
                doc = json.loads(snap_file.read())
        else:
            cch = httpd.cch
            doc = cch.getValue(req['ticket'])
            if doc is None:
                doc = []
    finally:
        lock.release()

    doc_data = {} if doc == [] else doc[-1]

    lock.acquire()
    try:
        code = get("code.cla.json", formula=req['formula'], mongo=mongo)
    finally:
        lock.release()
    print('GetCode version: ' + code['version'])

    for n in range(len(doc), req['new_step'] + 1):
        print('Step: ' + str(n))
        doc_data = next_step(doc_data, code, req['ds'], req['id'], n, mongo)
        # Store a copy so that later steps cannot alter earlier frames.
        doc.append(copy.deepcopy(doc_data))
    new_cadres = doc[req['step'] + 1:]

    lock.acquire()
    try:
        if is_admin:
            with open(snap_file_name, 'w') as snap_file:
                snap_file.write(json.dumps(doc, indent=4))
        else:
            cch.putValue(req['ticket'], doc)
    finally:
        lock.release()

    self.site = urllib.quote(json.dumps(new_cadres))
def IsNumericAnnotator(text, mongo):
    """Annotate *text* with a measure unit and/or a numeric value.

    Measure: every quoted term of the "DOSAGE" taxonomy is searched in the
    whitespace-normalised text.  A term counts only when the characters
    directly before and after its first occurrence are not ASCII letters;
    the last such term is stored as 'measure'.

    Number: the first match of ``[-+]?\\d*\\.\\d+|\\d+`` in *text*.  When
    present, 'class' is 'numeric', 'value' is the match as a float and
    'type' is 'number' if the match is the whole text, else
    'contains_number'.

    Returns a dict with any of the keys 'measure', 'class', 'type', 'value'.
    """
    result = {}
    whitespace = re.compile(r"\s+", re.M | re.I | re.U)

    # Find a measure.
    tax_lines = get("tax.tset", taxonomy="DOSAGE", mongo=mongo).split('\n')
    # The text is the same for every taxonomy line: normalise it once.
    normalized_text = whitespace.sub(' ', text)
    for line in tax_lines:
        if not line.startswith('"'):
            continue
        measure = whitespace.sub(' ', line.split('"')[1])
        pos = normalized_text.find(measure)
        if pos == -1:
            continue
        end = pos + len(measure)
        # One character on each side of the match.  For pos == 0 the slice
        # [-1:0] is empty, which is what is wanted at the start of the text.
        neighbours = normalized_text[pos - 1:pos] + normalized_text[end:end + 1]
        # [A-Za-z], not [A-z]: the latter also matches [ \ ] ^ _ and `.
        if re.search(r'[A-Za-z]', neighbours) is None:
            result['measure'] = measure

    # NOTE(review): the optional sign binds only to the decimal alternative,
    # so "-5" yields 5.0 -- confirm whether that is intended.
    number = re.search(r"[-+]?\d*\.\d+|\d+", text)
    if number is None:
        return result

    result['class'] = 'numeric'
    matched = number.group(0)
    # Either the text is exactly a number or it merely contains one.
    result['type'] = 'number' if matched == text else 'contains_number'
    result['value'] = float(matched)
    return result
def start_annotate(mongo):
    """Annotate every chunk of every card in every dataset and store results.

    For each dataset in ``datasets`` and each of its cards, every chunk is
    annotated with each taxonomy in ``taxes`` (via ``taxonomy``) and with
    ``IsNumericAnnotator``.  Per card, the annotated document is stored as
    "ch.json" and the names of the taxonomies that matched as "key_words".
    Per dataset, the card lists accumulated in ``taxes`` are stored as
    'tax.idx'.
    """
    for dataset in datasets:
        for number_of_card in get("all_indexes", dataset=dataset, mongo=mongo):
            print(dataset + ": card #" + number_of_card)
            key_words = set()
            doc_data = create_dict(dataset, number_of_card, mongo)

            for sentence in doc_data["sentences"]:
                for chunk in sentence["chunks"]:
                    # NOTE(review): a one-element tuple, not the handle
                    # itself, is passed to IsNumericAnnotator below --
                    # confirm this matches the callee's signature.
                    par = (mongo, )
                    for tax in taxes:
                        found = taxonomy(chunk["text"], tax, mongo)
                        chunk["data"].update(found)
                        if found == {}:
                            continue
                        key_words.add(tax)
                        cards = taxes[tax]
                        # Record the card once, even if several chunks match.
                        if cards == [] or cards[-1] != number_of_card:
                            cards.append(number_of_card)
                    numeric = IsNumericAnnotator(chunk["text"], par)
                    chunk["data"].update(numeric)

            put("ch.json", doc_data, dataset=dataset,
                number_of_card=number_of_card, mongo=mongo)
            put("key_words", list(key_words), dataset=dataset,
                number_of_card=number_of_card, mongo=mongo)

        for tax in taxes:
            put('tax.idx', taxes[tax], taxonomy=tax, dataset=dataset,
                mongo=mongo)
def __init__(self, args, mongo, httpd):
    """Collect everything the client needs to start working on one card.

    args  -- mapping whose 'args' entry is a URL-quoted JSON object; the keys
             read here are 'formula', 'id', 'ds' and 'ticket'.
    mongo -- storage handle forwarded to ``get``.
    httpd -- server object providing ``mLock`` and the cache ``cch``.

    The decoded state is extended with:
      'code'        -- one entry per ``code['source']`` step
                       ('text', 'id', 'changes' = -1, 'visible' = False);
      'key_words'   -- stored "key_words" of the card;
      'initial_doc' -- stored "doc.html" of the card;
      'anns'        -- stored "ch.json" of the card;
      'info'        -- stored "doc.json" of the card, or {} when missing;
      'ticket'      -- unless it is already 'admin', a free ticket from the
                       cache, falling back to 'admin' when none is free.
    ``self.site`` is set to the URL-quoted JSON of the state.
    """
    state = json.loads(urllib.unquote(args['args']))
    print(json.dumps(state, indent=4))

    lock = httpd.mLock

    def locked_get(name, **query):
        # Read one item from storage while holding the lock; the lock is
        # released even when the read raises.
        lock.acquire()
        try:
            return get(name, mongo=mongo, **query)
        finally:
            lock.release()

    # Code
    code = locked_get("code.cla.json", formula=state['formula'])
    print('getInfo version: ' + code['version'])
    state['code'] = [
        {
            'text': step['text'],
            'id': step['source_id'],
            'changes': -1,
            'visible': False,
        }
        for step in code['source']
    ]

    card = {'number_of_card': state['id'], 'dataset': state['ds']}

    # Key words
    state['key_words'] = locked_get('key_words', **card)
    # Initial document
    state['initial_doc'] = locked_get('doc.html', **card)
    # Annotations
    state['anns'] = locked_get('ch.json', **card)
    # Info
    info = locked_get('doc.json', **card)
    state['info'] = {} if info is None else info

    # Ticket
    lock.acquire()
    try:
        cch = httpd.cch
        if state['ticket'] != 'admin':
            state['ticket'] = cch.getFreeTicket()
            if state['ticket'] is None:
                state['ticket'] = 'admin'
    finally:
        lock.release()
    print('ticket: ' + state['ticket'])

    self.site = urllib.quote(json.dumps(state))
def __init__(self, args, mongo, httpd):
    """Build one page of the card list selected in the pivot table.

    args  -- mapping whose 'args' entry is a URL-quoted JSON object; the keys
             read here are 'ds', 'formula', 'selected_cells', 'tax',
             'portion' and 'list'.
    mongo -- storage handle forwarded to ``get``.
    httpd -- server object; only ``httpd.mLock`` is used.

    Steps:
      1. For every pivot cell (row i = ``apriory[i]``, column j =
         ``apostriory[j]``) intersect the stored card lists, write the size
         into the cell's 'count' and, for selected cells, add the cards to
         the result.
      2. Keep (flag false) or remove (flag true) the cards indexed for the
         requested taxonomy; taxonomy 'None' stands for all cards.
      3. Sort the card numbers numerically, store the total in 'count'.
      4. Append one entry ('id', 'size', 'diagnosis', 'abstract') per card
         of the requested 100-card portion to ``state['list']``.
    ``self.site`` is set to the URL-quoted JSON of the state.
    """
    state = json.loads(urllib.unquote(args['args']))
    lock = httpd.mLock

    def locked_get(name, **query):
        # Read one item from storage while holding the lock; the lock is
        # released even when the read raises.
        lock.acquire()
        try:
            return get(name, mongo=mongo, **query)
        finally:
            lock.release()

    dataset = state['ds']
    formula = state['formula']

    # Find all cards selected in pivot table
    ids = locked_get('all_indexes', dataset=dataset)

    need_list = []
    for i, row in enumerate(state['selected_cells']):
        ids3 = locked_get('results_apriory.' + apriory[i],
                          dataset=dataset, formula=formula)
        print('Apriory (' + apriory[i] + '): ' + str(ids3))
        for j, cell in enumerate(row):
            list_apostriory = locked_get('results_apostriory.' + apostriory[j],
                                         dataset=dataset, formula=formula)
            print('Apostriory (' + apostriory[j] + '): ' + str(list_apostriory))
            # Outside dataset 'cci' the a-priori list is replaced by the
            # full card list.
            if dataset != 'cci':
                ids3 = ids
            cell_cards = intersection(list_apostriory, ids3)
            if cell['selected']:
                need_list = union(need_list, cell_cards)
            cell['count'] = len(cell_cards)

    # Find cards with the taxonomy only
    tax = state['tax']
    if tax['tax'] == 'None':
        ids1 = ids
    else:
        ids1 = locked_get("tax.idx", dataset=dataset, taxonomy=tax['tax'])
    if not tax['flag']:
        need_list = intersection(need_list, ids1)
    else:
        need_list = difference(need_list, ids1)

    # Sort list of cards numerically, then go back to strings
    need_list = [str(number)
                 for number in sorted(int(card_id) for card_id in need_list)]
    state['count'] = len(need_list)

    # Data of every card
    cards_in_one_portion = 100
    if state['portion'] * cards_in_one_portion > len(need_list):
        state['portion'] = 0
    start = state['portion'] * cards_in_one_portion
    cut_need_list = need_list[start:start + cards_in_one_portion]

    chf = {}
    for stat in apriory:
        chf[stat] = locked_get("results_apriory." + stat,
                               dataset=dataset, formula=formula)

    for card_id in cut_need_list:
        card = {'id': card_id}
        card['size'] = str(locked_get("size_of_doc", dataset=dataset,
                                      number_of_card=card_id))
        card['diagnosis'] = [formula + '-' + stat
                             for stat in apriory if card_id in chf[stat]]
        abstract = locked_get('abstract', dataset=dataset,
                              number_of_card=card_id)
        # Decode the basic HTML entities.  The previous replacements mapped
        # each character to itself and therefore did nothing; their order
        # (ampersand last) is the one required for decoding.
        abstract = abstract.replace('&gt;', '>')
        abstract = abstract.replace('&lt;', '<')
        abstract = abstract.replace('&amp;', '&')
        card['abstract'] = abstract
        state['list'].append(card)

    self.site = urllib.quote(json.dumps(state))
def __init__(self, args, mongo, httpd):
    """Apply a formula to a document and report the diagnosis.

    args  -- mapping whose 'args' entry is a URL-quoted JSON object; the keys
             read here are 'formula', 'doc' (URL-quoted text or
             'Doc #<card number>') and 'ticket'.
    mongo -- storage handle forwarded to ``get`` and ``next_step``.
    httpd -- server object providing ``mLock``, the progress mapping
             ``results`` and the cache ``cch``.

    Progress is published in ``httpd.results[ticket]`` before each phase.
    The final state (including the result) is stored in the cache under the
    ticket.  ``self.site`` is set to the URL-quoted JSON of the result, which
    has the keys 'formula', 'diagnose', 'sentences' and, for stored cards
    whose "doc.json" has a key containing the formula name, 'apriory'.
    On failure ``self.site`` is a plain message instead.
    """
    print('Run redactor.')
    lock = httpd.mLock
    req = json.loads(urllib.unquote(args['args']))
    formula = req['formula']
    text = urllib.unquote(req['doc'])
    ticket = req['ticket']
    state = {}

    def locked_get(name, **query):
        # Read one item from storage while holding the lock; the lock is
        # released even when the read raises.
        lock.acquire()
        try:
            return get(name, mongo=mongo, **query)
        finally:
            lock.release()

    def publish(**changes):
        # Update the shared progress state and expose it under the ticket,
        # all while holding the lock.
        lock.acquire()
        try:
            state.update(changes)
            httpd.results[ticket] = state
        finally:
            lock.release()

    code = locked_get('code.cla.json', formula=formula)

    publish(formula=formula, step='Generation of chunks...')
    print('state: ' + str(state))
    doc = self.generator_of_chunks(text, mongo, lock)
    if doc is None:
        self.site = 'File ' + text[5:] + ' is not found.'
        return

    publish(step='Compile the formula...')
    print('state: ' + str(state))
    # On this particular host only stored documents may be processed.
    if socket.gethostname() == 'noX540LJ' and text[:5] != 'Doc #':
        self.site = 'It is not a server.'
        return

    doc_data = create_dict_by_doc(doc)
    state['count_of_steps'] = code['count_of_steps']
    for n in range(code['count_of_steps'] + 1):
        publish(step='Apply the formula...', current_step=n)
        print('Step: ' + str(n))
        doc_data = next_step(doc_data, code, None, None, n, mongo)

    res = {'formula': formula}
    if 'value' in doc_data['data']:
        res['diagnose'] = formula + '-' + doc_data['data']['value']
    else:
        res['diagnose'] = formula + ' - ' + 'not mentioned'

    if text[:5] == 'Doc #':
        js = locked_get('doc.json', number_of_card=text[5:], dataset='cci')
        print('state: ' + str(state))
        # A card may have no "doc.json"; treat that as "no a-priori data"
        # instead of failing on iteration over None.
        for key in js or {}:
            if key.find(formula) != -1:
                res['apriory'] = str(key)

    # Only sentences with at least two annotations are reported.
    res['sentences'] = []
    for sentence in doc_data['sentences']:
        annotations = sentence['data']
        if len(annotations) < 2:
            continue
        attr = ''.join(key + ': ' + annotations[key] + '; '
                       for key in annotations if key != 'reject')
        sentence_text = ''.join(chunk['text'] + ' '
                                for chunk in sentence['chunks'])
        res['sentences'].append({'attr': attr, 'sent': sentence_text})

    lock.acquire()
    try:
        state['step'] = 'Ready.'
        state['res'] = res
        httpd.cch.putValue(ticket, state)
    finally:
        lock.release()
    print('state: ' + '<document>')

    self.site = urllib.quote(json.dumps(res))
message = 'Line ' + str(ret['line']) + ': ' + ret['message'] print(message) sys.exit() else: code = {} code['rulename'] = claudia_file_name code['version'] = '0.1' code['declarations'] = negation code['statements'] = ret['action'] code['source'] = ret['source'] code['count_of_steps'] = len(ret['steps']) #code['annotations'] = ret['annotations'] for source in code['source']: #print('source-id: ' + str(source['source_id'])) while source['text'].find(' ') != -1: source['text'] = source['text'].replace(' ', ' ') return code if __name__ == '__main__': mongo = connect() for claudia_file_name in ['CHF', 'MI']: claudia = get('code.cla', formula=claudia_file_name, from_file=True) code = start_compilator(claudia, claudia_file_name) file = open('cci/claudia_rules/' + claudia_file_name + '.cla.json', 'w') file.write(json.dumps(code, indent=4)) file.close() put('code.cla.json', code, formula=claudia_file_name, mongo=mongo) print('Ok.')
def create_dict(dataset, patient, mongo):
    """Load the stored "doc.html" of one card and convert it.

    dataset -- dataset name forwarded to ``get``.
    patient -- card number, passed to ``get`` as ``number_of_card``.
    mongo   -- storage handle forwarded to ``get``.

    Returns whatever ``create_dict_by_doc`` returns for the loaded document.
    """
    return create_dict_by_doc(
        get("doc.html", dataset=dataset, number_of_card=patient, mongo=mongo))