def parsing_tsv_to_manual_mapped(tsv_file, icd2umls_file):
    icd2umls = {}
    for l in utils.read_text_file(icd2umls_file):
        cols = l.split('\t')
        icd2umls[cols[0]] = cols[1]
    lines = utils.read_text_file(tsv_file)
    condition2code = {}
    for l in lines:
        cols = l.split('\t')
        c = cols[0]
        icds = cols[len(cols) - 1].split(',')
        concepts = []
        condition2code[c] = concepts
        for icd in icds:
            icd = icd.strip().upper()
            icd_codes = []
            m = re.match(r'([A-Z])(\d+)\-[A-Z]{0,1}(\d+)', icd)
            if m is not None:
                logging.info('range mappings: %s' % m.group(0))
                # NB: range() is end-exclusive, so the last code of an ICD range is not generated
                for num in range(int(m.group(2)), int(m.group(3)), 1):
                    icd = '%s%02d' % (m.group(1), num)
                    icd_codes.append(icd)
            else:
                icd_codes.append(icd)
            concepts += process_icd_to_umls(icd_codes, icd2umls=icd2umls)
    # the resulting mapping is only logged here, not returned
    logging.info(json.dumps(condition2code))
def patient_level_analysis(complete_anns_file, output_file):
    lines = utils.read_text_file(complete_anns_file)
    pos_condition2patients = {}
    patient2conditions = {}
    positive_labels = ['posM', 'hisM']
    indexable_labels = ['posM', 'hisM', 'negM']
    for l in lines:
        arr = l.split('\t')
        label = arr[2]
        condition = arr[3]
        pid = arr[8]
        if label in positive_labels:
            pos_condition2patients[condition] = [pid] if condition not in pos_condition2patients else \
                pos_condition2patients[condition] + [pid]
        if label in indexable_labels:
            pd = patient2conditions[pid] if pid in patient2conditions else {}
            patient2conditions[pid] = pd
            if label in pd:
                pd[label].append(condition)
                pd[label] = list(set(pd[label]))
            else:
                pd[label] = [condition]
    utils.save_json_array(
        {
            'p2c': patient2conditions,
            'c2p': pos_condition2patients
        }, output_file)
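# Illustrative note (added, not part of the original module): patient_level_analysis
# saves a single JSON object holding two views of the same mention-level annotations.
# The 'p2c' and 'c2p' keys come from the code above; the patient ids and condition
# names below are made-up placeholders.
#
#   {
#     "p2c": {"patient_001": {"posM": ["diabetes"], "negM": ["stroke"]}},
#     "c2p": {"diabetes": ["patient_001", "patient_007"]}
#   }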
def load_gaz_dir(self, gaz_dir):
    files = [f for f in listdir(gaz_dir) if isfile(join(gaz_dir, f))]
    for f in files:
        if f.endswith('.lst'):
            t = f.split('.')[0]
            self._type2gaz[t] = utils.read_text_file(join(gaz_dir, f))
            # use a separate loop variable so the gazetteer type name `t` is not shadowed
            self._all_entities += [e.lower() for e in self._type2gaz[t]]
def create_instances():
    results = {"instances": []}
    HOST_FILE_PATH = pathlib.Path(__file__).absolute().parent.parent.parent / "hosts.txt"
    addresses = read_text_file(HOST_FILE_PATH)
    names = list(random_name.generate(len(addresses)))
    # creating the public instance
    public = {
        "instance_name": names.pop(),
        "ip": addresses.pop(),
        "is_public": True
    }
    public_instance = Instance(config, public)
    results["instances"].append(str(public_instance))
    # creating the rest of the instances
    for name, ip in zip(names, addresses):
        private = {
            "instance_name": name,
            "ip": ip,
            "is_public": False
        }
        results["instances"].append(str(Instance(config, private)))
    return results
def run_learning(
        train_ann_dir, train_gold_dir, train_text_dir,
        test_ann_dir, test_gold_dir, test_text_dir, settings):
    log_level = 'DEBUG'
    log_format = '[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s'
    logging.basicConfig(level='DEBUG', format=log_format)
    log_file = './settings/processing.log'
    # NB: only the first basicConfig call takes effect; this second call is a no-op
    logging.basicConfig(level=log_level, format=log_format)
    global _min_sample_size, _ann_dir, _gold_dir, _test_ann_dir, _test_gold_dir, _gold_text_dir, _test_text_dir, _concept_mapping, _learning_model_dir
    global _labels, _gold_file_pattern, _ignore_mappings, _eHostGD, _cm_obj
    global _annotated_anns
    _annotated_anns = {}
    _min_sample_size = settings['min_sample_size']
    _ann_dir = train_ann_dir
    _gold_dir = train_gold_dir
    _test_ann_dir = test_ann_dir
    _test_gold_dir = test_gold_dir
    _gold_text_dir = train_text_dir
    _test_text_dir = test_text_dir
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    _gold_file_pattern = "%s_ann.xml" if 'gold_file_pattern' not in settings else settings['gold_file_pattern']
    _ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _eHostGD = settings['eHostGD'] if 'eHostGD' in settings else False
    _cm_obj = Concept2Mapping(_concept_mapping)
    # not using mention patterns for prediction as this is only an in-development feature
    mp_inst = None
    return do_learn_exp(settings['viz_file'],
                        num_dimensions=[50],
                        ignore_context=settings['ignore_context'] if 'ignore_context' in settings else False,
                        separate_by_label=True,
                        conll_output_file=settings['conll_output_file'],
                        eHostGD=_eHostGD,
                        mention_pattern=mp_inst)
def direct_nlp_prediction(settings):
    ann_dir = settings['test_ann_dir']
    test_text_dir = settings['test_fulltext_dir']
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _cm_obj = Concept2Mapping(_concept_mapping)
    file_keys = [
        f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))
    ]
    doc2predicted = {}
    for fk in file_keys:
        cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), _concept_mapping)
        d = cr.full_text_file_pattern % fk
        for ann in cr.annotations:
            # look up the label on the loaded Concept2Mapping object
            # (settings['concept_mapping_file'] itself is only a file path)
            if ann.cui in _cm_obj.cui2label:
                lbl = _cm_obj.cui2label[ann.cui]
                pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end,
                                      ann.negation, ann.temporality, ann.experiencer,
                                      'StudyName', lbl)
                put_ann_label(lbl, pheAnn, doc2predicted, d)
        for ann in cr.phenotypes:
            put_ann_label(ann.minor_type, ann, doc2predicted, d)
    return doc2predicted
def phenotype_counting(phenotype_def, concept_level_results, output_file):
    pd = utils.load_json_data(phenotype_def)
    npd = {}
    cd = utils.read_text_file(concept_level_results)
    c_headers = cd[0].split('\t')
    headers = [h for h in c_headers[2:len(c_headers) - 1]]
    for r in cd[1:]:
        arr = r.split('\t')
        c = arr[0]
        num_mention = arr[12]
        for p in pd:
            if c in pd[p]['concepts']:
                po = npd[p] if p in npd else {'freq': 0, 'p': p, 'num_concepts': len(pd[p]['concepts'])}
                npd[p] = po
                po['freq'] += int(num_mention)
                for idx in xrange(2, len(arr) - 1):
                    h = headers[idx - 2]
                    po[h] = int(arr[idx]) if h not in po else (int(arr[idx]) + int(po[h]))
    rows = ['\t'.join(['phenotype', 'num concepts'] + headers + ['prevalence'])]
    for p in npd:
        po = npd[p]
        rows.append('\t'.join([p, str(po['num_concepts'])] + [str(po[h]) for h in headers] + [str(po['freq'])]))
    utils.save_string('\n'.join(rows), output_file)
def generate_hpo_umls_mapping(hpo_dump):
    lines = utils.read_text_file(hpo_dump)
    # lines = [u'id: HP:3000076', u'def: "An abnormality', u'xref: UMLS:C4073283']
    maps = []
    cur_map = None
    for l in lines:
        m = re.match(r'^id\: (HP\:\d+)', l)
        if m is not None:
            print 'start with %s' % m.group(1)
            cur_map = {'hp': m.group(1), 'cuis': []}
            maps.append(cur_map)
        m = re.match(r'^xref: (UMLS:C\d+)', l)
        if m is not None:
            cur_map['cuis'].append(m.group(1))
        if l == 'is_obsolete: true':
            cur_map['is_obsolete'] = True
        m = re.match(r'^replaced_by: (HP:\d+)', l)
        if m is not None:
            cur_map['replaced_by'] = m.group(1)
    hpo2umls = {}
    obsolete2replace = {}
    for cur_map in maps:
        hpo2umls[cur_map['hp']] = cur_map['cuis'] if cur_map['hp'] not in hpo2umls \
            else cur_map['cuis'] + hpo2umls[cur_map['hp']]
        if 'is_obsolete' in cur_map and 'replaced_by' in cur_map:
            obsolete2replace[cur_map['hp']] = cur_map['replaced_by']
    for obs in obsolete2replace:
        if obsolete2replace[obs] in hpo2umls:
            hpo2umls[obs] = hpo2umls[obsolete2replace[obs]]
    print json.dumps(hpo2umls)
def run_study(folder_path, no_sql_filter=None):
    study_config = 'study.json' if no_sql_filter is None else 'study_no_filter.json'
    if isfile(join(folder_path, study_config)):
        r = utils.load_json_data(join(folder_path, study_config))
        retained_patients = None
        if 'query_patients_file' in r:
            retained_patients = []
            lines = utils.read_text_file(r['query_patients_file'])
            for l in lines:
                arr = l.split('\t')
                retained_patients.append(arr[0])
        study(folder_path, r['cohort'], r['sql_config'], r['db_conn'],
              concept_mapping.get_umls_client_inst(r['umls_key']),
              do_preprocessing=r['do_preprocessing'],
              rule_setting_file=r['rule_setting_file'],
              do_one_iter=r['do_one_iter'],
              sem_idx_setting_file=None if 'sem_idx_setting_file' not in r else r['sem_idx_setting_file'],
              concept_filter_file=None if 'concept_filter_file' not in r else r['concept_filter_file'],
              retained_patients_filter=retained_patients,
              filter_obj_setting=None if 'filter_obj_setting' not in r else r['filter_obj_setting'],
              do_disjoint_computing=True if 'do_disjoint' not in r else r['do_disjoint'],
              export_study_concept_only=False if 'export_study_concept' not in r else r['export_study_concept'])
    else:
        print '%s not found in the folder' % study_config
def copy_docs(index_setting_file, src_index, src_doc_type, entity_id_field_name,
              dest_index, dest_doc_type, patient_list_file, thread_num=30):
    """
    copy a list of docs (doc ids read from patient_list_file) from one index to another
    :param index_setting_file:
    :param src_index:
    :param src_doc_type:
    :param entity_id_field_name:
    :param dest_index:
    :param dest_doc_type:
    :param patient_list_file:
    :param thread_num:
    :return:
    """
    es = EntityCentricES.get_instance(index_setting_file)
    patients = utils.read_text_file(patient_list_file)
    utils.multi_thread_tasking(patients, thread_num, do_copy_doc,
                               args=[es, src_index, src_doc_type, entity_id_field_name,
                                     dest_index, dest_doc_type])
    print 'all done'
def load_document_to_es(settings):
    """
    load document to elastic search
    :param settings:
    :return:
    """
    doc_folder = settings.get_attr(['epr_index', 'doc_folder'])
    d2p_tsv = settings.get_attr(['epr_index', 'doc2patient_tsv'])
    es = SemEHRES.get_instance_by_setting(
        settings.get_attr(['epr_index', 'es_host']),
        settings.get_attr(['epr_index', 'es_index_name']),
        settings.get_attr(['epr_index', 'doc_type']),
        '', '')
    tsv_lines = utils.read_text_file(d2p_tsv)
    d2p = {}
    for l in tsv_lines:
        arr = l.split('\t')
        if len(arr) > 1:
            d2p[arr[0]] = arr[1]
    for f in [f for f in listdir(doc_folder) if isfile(join(doc_folder, f))]:
        if f in d2p:
            p = d2p[f]
            t = utils.read_text_file_as_string(join(doc_folder, f))
            es.index_new_doc(
                index=settings.get_attr(['epr_index', 'es_index_name']),
                doc_type=settings.get_attr(['epr_index', 'doc_type']),
                data={
                    settings.get_attr(['epr_index', 'text_field']): t,
                    settings.get_attr(['epr_index', 'patient_id_field']): p,
                    "id": f
                },
                doc_id=f)
def index_cris_cohort():
    f_patient_doc = './hepc_pos_doc_brcid.txt'
    f_yodie_anns = 'U:/kconnect/hepc_output/'
    print 'loading all docs at a time...'
    docs = load_all_docs()
    print 'docs read'
    doc_dict = {}
    for d in docs:
        doc_dict[d['CN_Doc_ID']] = d
    es = EntityCentricES.get_instance('./index_settings/es_cris_setting.json')
    lines = utils.read_text_file(f_patient_doc, encoding='utf-8-sig')
    doc_to_patient = {}
    for l in lines:
        arr = l.split('\t')
        doc_to_patient[arr[1]] = arr[0]
    container = []
    ann_files = [
        f for f in listdir(f_yodie_anns) if isfile(join(f_yodie_anns, f))
    ]
    for ann in ann_files:
        utils.multi_thread_large_file_tasking(
            join(f_yodie_anns, ann), 20, do_index_cris,
            args=[es, doc_to_patient, doc_dict, container],
            file_encoding='iso-8859-1')
        print 'file %s [%s] done' % (ann, len(container))
    print 'num done %s' % len(container)
    print 'done'
def regenerate_manual_mapped_concepts(tsv, closure_file):
    selected_concepts = set()
    c2l = {}
    for l in utils.read_text_file(tsv):
        arr = l.split('\t')
        selected_concepts.add(arr[1])
        c2l[arr[1]] = arr[0]
    t2closure = utils.load_json_data(closure_file)
    mapped_concepts = []
    map = {}
    v_map = {}
    for t in t2closure:
        # concepts shared between this type's closure and the manually selected set
        disjoint_list = list(set(t2closure[t]) & selected_concepts)
        if len(disjoint_list) > 0:
            mapped_concepts += disjoint_list
            map[t] = {
                "tc": {
                    "closure": len(disjoint_list),
                    "mapped": disjoint_list[0]
                },
                "concepts": disjoint_list
            }
            v_map[t] = [('%s [%s]' % (c2l[c], c)) for c in disjoint_list]
    print json.dumps(map)
    print selected_concepts - set(mapped_concepts)
    print json.dumps(v_map)
def direct_nlp_prediction(settings):
    ann_dir = settings['test_ann_dir']
    test_text_dir = settings['test_fulltext_dir']
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _cm_obj = Concept2Mapping(_concept_mapping)
    file_keys = [
        f[:f.rfind('.')].replace('se_ann_', '')
        for f in listdir(ann_dir) if isfile(join(ann_dir, f))
    ]
    doc2predicted = {}
    for fk in file_keys:
        cr = CustomisedRecoginiser(join(ann_dir, 'se_ann_%s.json' % fk), _concept_mapping)
        d = fk
        for ann in cr.annotations:
            if ann.cui in _cm_obj.concept2label:
                lbl = _cm_obj.concept2label[ann.cui][0]
                pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end,
                                      ann.negation, ann.temporality, ann.experiencer,
                                      'StudyName', lbl)
                if ann.negation != 'Affirmed' or len(ann.ruled_by) > 0:
                    continue
                put_ann_label(lbl, pheAnn, doc2predicted, d)
        for ann in cr.phenotypes:
            put_ann_label(ann.minor_type, ann, doc2predicted, d)
    return doc2predicted
def merge_mappings_dictionary(map_files, dict_dirs, new_map_file, new_dict_folder):
    maps = [utils.load_json_data(mf) for mf in map_files]
    new_m = {}
    for m in maps:
        new_m.update(m)
    t2list = {}
    for dd in dict_dirs:
        lst_files = [
            f for f in listdir(dd) if isfile(join(dd, f)) and f.endswith('.lst')
        ]
        for f in lst_files:
            t = f[:f.index('.')]
            labels = utils.read_text_file(join(dd, f))
            if t not in t2list:
                t2list[t] = set()
            for l in labels:
                if len(l) > 0:
                    t2list[t].add(l)
    utils.save_json_array(new_m, new_map_file)
    logging.info('mapping saved to %s' % new_map_file)
    for t in t2list:
        utils.save_string('\n'.join(list(t2list[t])) + '\n', join(new_dict_folder, t + '.lst'))
        logging.info('%s.lst saved' % t)
    logging.info('all done')
def load_corpus_to_FHIR_mapping(tsv_map_file):
    lines = utils.read_text_file(tsv_map_file)
    sec_to_fhir = {}
    for l in lines:
        arr = l.split('\t')
        for i in range(1, len(arr)):
            sec_to_fhir[arr[i]] = arr[0]
    return sec_to_fhir
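# Usage sketch (assumed file layout, not taken from the original repo): each row of the
# mapping TSV starts with a FHIR section/resource name followed by one or more corpus
# section headings, tab separated. load_corpus_to_FHIR_mapping inverts that, keying each
# corpus heading to its FHIR target, e.g. a row
#   Condition<TAB>Diagnosis<TAB>Problem List
# would produce:
#   {'Diagnosis': 'Condition', 'Problem List': 'Condition'}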
def load_patient_date(patient_date_file):
    lines = utils.read_text_file(patient_date_file)
    print 'patient file read. parsing...'
    p2time = {}
    for l in lines[1:]:
        arr = l.split(',')
        p2time[arr[0]] = datetime.strptime(arr[1], '%d/%m/%Y')
    return p2time
def load_doc_date(doc_date_file):
    lines = utils.read_text_file(doc_date_file)
    print 'doc file read. parsing...'
    d2time = {}
    for l in lines[1:]:
        arr = l.split('\t')
        d2time[arr[0]] = datetime.strptime(arr[2], '%Y-%m-%d %H:%M:%S.%f')
    return d2time
def complete_tsv_concept_label(umls, tsv_file):
    lines = []
    for l in utils.read_text_file(tsv_file):
        arr = l.split('\t')
        print arr
        arr.insert(1, get_umls_concept_detail(umls, arr[1])['result']['name'])
        lines.append(arr)
    print '\n'.join(['\t'.join(l) for l in lines])
def predict_exp(corpus_trans_file, ann_file, cache_file, output_file):
    # initialise pattern instances from documents
    if not isfile(cache_file):
        # load labelled data
        ann_lines = utils.read_text_file(ann_file)
        prev_doc = None
        anns = []
        doc_anns = []
        ptn_insts = []
        doc_to_pt = {}
        for ls in ann_lines:
            l = ls.split('\t')
            doc_id = l[1]
            doc_to_pt[doc_id] = l[0]
            if prev_doc != doc_id:
                if prev_doc is not None:
                    if exists(join(working_folder, 'docs', '%s.txt' % prev_doc)):
                        doc_anns.append((prev_doc, anns))
                anns = []
                prev_doc = doc_id
            anns.append({
                's': int(l[2]),
                'e': int(l[3]),
                'signed_label': l[4],
                'gt_label': l[5]
            })
        if prev_doc is not None:
            if exists(join(working_folder, 'docs', '%s.txt' % prev_doc)):
                doc_anns.append((prev_doc, anns))
        # multithreading to process labelled docs
        print 'processing docs...'
        utils.multi_thread_tasking(doc_anns, 30, do_process_labelled_doc, args=[ptn_insts])
        jl.dump({'insts': ptn_insts, 'doc_to_pt': doc_to_pt}, cache_file)
    else:
        cached = jl.load(cache_file)
        ptn_insts = cached['insts']
        doc_to_pt = cached['doc_to_pt']
    cp = sp.CorpusPredictor.load_corpus_model(corpus_trans_file)
    ret = []
    for inst in ptn_insts:
        print 'predicting [%s]...' % inst.sentence
        acc = cp.predcit(inst)
        print 'accuracy: %s' % acc
        ann = inst.annotations[0]
        ret.append(
            (doc_to_pt[inst.doc_id], inst.doc_id, str(ann['s']), str(ann['e']),
             ann['signed_label'], ann['gt_label'], str(acc)))
    s = []
    for r in ret:
        s.append(u'\t'.join(r))
    print u'\n'.join(s)
    utils.save_json_array(ret, output_file)
    return ret
def load_tree_node_file(file_path, node_name=None):
    if node_name is None:
        p, fn = split(file_path)
        node_name = fn
    folder_node = create_folder_node('\\', node_name)
    for l in utils.read_text_file(file_path):
        leaf = create_leaf_node(folder_node['path'], l.split('\t')[0])
        folder_node['children'].append(leaf)
    return folder_node
def align_mapped_concepts(map_file, disorder_file):
    concept_map = utils.load_json_data(map_file)
    disorders = [d.strip() for d in utils.read_text_file(disorder_file)]
    exact_mapped = {}
    for d in disorders:
        if d in concept_map:
            exact_mapped[d] = concept_map[d]
        else:
            exact_mapped[d] = ""
    print json.dumps(exact_mapped)
def produce_weka_output(predict_output_file, orig_features_file, merged_output_file, arrf_file,
                        threshold=.70, mode='threshold'):
    orig_data_lines = utils.read_text_file(orig_features_file)
    ret = utils.load_json_data(predict_output_file)
    ptn2anns = {}
    for r in ret:
        ptn = r[0]
        if ptn not in ptn2anns:
            ptn2anns[ptn] = {'posM': 0, 'negM': 0, 'hisM': 0, 'otherM': 0}
        if mode == 'threshold':
            if float(r[6]) >= threshold:
                ptn2anns[ptn][r[4]] += 1
        elif mode == 'weighted_sum':
            ptn2anns[ptn][r[4]] += float(r[6])
    rows = []
    arrf_header = """@RELATION hepc
@ATTRIBUTE Total_Mentions NUMERIC
@ATTRIBUTE Positive_Mentions NUMERIC
@ATTRIBUTE History_hypothetical_Mentions NUMERIC
@ATTRIBUTE Negative_Mentions NUMERIC
@ATTRIBUTE Other_Experiencers NUMERIC
@ATTRIBUTE AT_Total_Mentions NUMERIC
@ATTRIBUTE AT_Positive_Mentions NUMERIC
@ATTRIBUTE AT_History_hypothetical_Mentions NUMERIC
@ATTRIBUTE AT_Negative_Mentions NUMERIC
@ATTRIBUTE AT_Other_Experiencers NUMERIC
@ATTRIBUTE class {positive,negative,unknown}
@DATA
"""
    arrf_rows = []
    for l in orig_data_lines:
        arr = l.split('\t')
        ptn = arr[0]
        new_line = arr[:6] + \
            ([str(ptn2anns[ptn]['posM'] + ptn2anns[ptn]['negM'] + ptn2anns[ptn]['hisM'] + ptn2anns[ptn]['otherM']),
              str(ptn2anns[ptn]['posM']),
              str(ptn2anns[ptn]['hisM']),
              str(ptn2anns[ptn]['negM']),
              str(ptn2anns[ptn]['otherM'])] if ptn in ptn2anns else ['0', '0', '0', '0', '0']) + \
            [arr[6]]
        rows.append(new_line)
        arrf_rows.append(','.join(new_line[1:]))
    utils.save_string(arrf_header + '\n'.join(arrf_rows), arrf_file)
    utils.save_string('\n'.join(['\t'.join(r) for r in rows]), merged_output_file)
def load_episode_data(file_path, date_format='%d/%m/%Y %H:%M'):
    lines = utils.read_text_file(file_path)
    eps = []
    for l in lines:
        arr = l.split('\t')
        eps.append({
            'brcid': arr[0],
            'win1': {'s': datetime.strptime(arr[1], date_format),
                     'e': datetime.strptime(arr[2], date_format)},
            'win2': {'s': datetime.strptime(arr[3], date_format),
                     'e': datetime.strptime(arr[4], date_format)},
            'win3': {'s': datetime.strptime(arr[5], date_format),
                     'e': datetime.strptime(arr[6], date_format)}
        })
    return eps
def load_patient_truth(truth_file):
    all_pids = []
    lines = utils.read_text_file(truth_file)
    type2ids = {}
    for l in lines:
        arr = l.split('\t')
        if arr[2] not in type2ids:
            type2ids[arr[2]] = []
        type2ids[arr[2]].append(arr[0])
        all_pids.append(arr[0])
    return type2ids, all_pids
def extract_doc_level_ann(ann_dump, output_folder):
    """
    extract doc level annotations and save to separate files
    :param ann_dump:
    :param output_folder:
    :return:
    """
    lines = utils.read_text_file(ann_dump)
    for l in lines:
        doc_ann = json.loads(l)
        utils.save_string(l, join(output_folder, doc_ann['docId'].split('.')[0] + '.json'))
def index_cris_patients():
    f_patient_doc = './hepc_pos_doc_brcid.txt'
    lines = utils.read_text_file(f_patient_doc, encoding='utf-8-sig')
    patients = []
    for l in lines:
        arr = l.split('\t')
        if arr[0] not in patients:
            patients.append(arr[0])
    print 'total patients %s %s' % (len(patients), patients[0])
    es = EntityCentricES.get_instance('./index_settings/es_cris_setting.json')
    utils.multi_thread_tasking(patients, 10, do_index_patient, args=[es])
    print 'done'
def process_labelled_docs(labelled_file, corpus_model_file, mini_comp_file):
    corpus_analyzer = None
    if not isfile(corpus_model_file):
        # load labelled data
        ann_lines = utils.read_text_file(labelled_file)
        prev_doc = None
        anns = []
        doc_anns = []
        ptn_insts = []
        for ls in ann_lines:
            l = ls.split('\t')
            doc_id = l[0]
            if prev_doc != doc_id:
                if prev_doc is not None:
                    if exists(join(working_folder, 'docs', '%s.txt' % prev_doc)):
                        doc_anns.append((prev_doc, anns))
                anns = []
                prev_doc = doc_id
            anns.append({
                's': int(l[1]),
                'e': int(l[2]),
                'signed_label': l[3],
                'gt_label': l[4]
            })
        if prev_doc is not None:
            if exists(join(working_folder, 'docs', '%s.txt' % prev_doc)):
                doc_anns.append((prev_doc, anns))
        # multithreading to process labelled docs
        print 'processing docs...'
        utils.multi_thread_tasking(doc_anns, 30, do_process_labelled_doc, args=[ptn_insts])
        print 'merging patterns..'
        corpus_analyzer = sp.CorpusAnalyzer()
        for pi in ptn_insts:
            corpus_analyzer.add_pattern(pi)
        corpus_analyzer.serialise(corpus_model_file)
    else:
        corpus_analyzer = sp.CorpusAnalyzer.load_seralisation(corpus_model_file)
        # corpus_analyzer.show()
    # pt_insts = corpus_analyzer.pattern_to_insts
    if isfile(mini_comp_file):
        corpus_analyzer.load_mini_comp_dict(mini_comp_file)
    else:
        corpus_analyzer.produce_save_comp_dict(mini_comp_file)
    corpus_analyzer.show_mini_comp_patterns()
    # generate_corpus_model(corpus_analyzer)
    return corpus_analyzer
def run(self, quota, **query_kwargs):
    for t in range(quota):
        if t % 10 == 0 and t != 0:
            print("Query# {}".format(t))
        # train model with current labeled train data
        X_train, y_train, _ = self.dataset.get_labeled_data('train')
        self.model.fit(X_train, y_train)
        # make active query
        ask_id = self.qs.make_query(self.model, self.dataset, **query_kwargs)
        # check current prediction for this id and add user given
        if self.dataset.mode == 'active':
            # prompt user here to label by showing document text
            if not self.docid2path:
                raise ValueError('document paths are required for interactive learning')
            doctext = utils.read_text_file(self.docid2path[ask_id])
            print("================================================")
            print("                 Document Text                  ")
            print("================================================")
            print(doctext)
            print()
            true_class_ask_id = int(input('DOCUMENT LABEL (Hint: {}): '.format(self.y_ideal[ask_id])))
            _ = os.system('cls')
        else:
            true_class_ask_id = self.dataset.ask_label(ask_id)
        # accumulate user labeled example for validation set (if set)
        if self.progressive_validation and t % self.pr_rate == 0 and t > 1:
            self.dataset.update(ask_id, true_class_ask_id, 'valid')
        else:
            # update model and dataset
            self.dataset.update(ask_id, true_class_ask_id, 'query', t=t)
        self.relevant_found = sum([1 for props in self.dataset.index2props.values() if props.y_true == 1])
        if t > 1 and t % self.eval_at == 0:
            self.supervised_eval('train')
            self.eval_xs['train'].append(t)
            if self.dataset.mode:
                self.active_simulation_eval()
                self.eval_xs['simulate'].append(t)
            if self.progressive_validation:
                # if we have at least 5 positive classes in validation, start evaluation
                pos_val_count = sum([1 for props in self.dataset.index2props.values()
                                     if props.is_valid and props.y_true == 1])
                if pos_val_count >= 5:
                    self.supervised_eval('valid')
                    self.eval_xs['valid'].append(t)
def __iter__(self):
    for path in super().__iter__():
        if self.match_ids:
            doc_id = os.path.split(path)[1][:-4]
            if doc_id not in self.match_ids:
                continue
        if self.read:
            if self.as_lines:
                lines_iter = read_doc_lines(path, encoding=self.encoding)
                yield lines_iter, path
            else:
                raw_text = read_text_file(path, encoding=self.encoding)
                yield raw_text, path
        else:
            yield path