def __init__(self, wombat_path=str()): self.wombat_path = wombat_path try: print('Connecting to wombat path:', self.wombat_path) self.wbc = wb_conn(path=self.wombat_path, create_if_missing=False) except: self.wbc = None print("ERROR: unable to locate {} file".format(self.wombat_path))
from wombat_api.preprocessors.standard_preprocessor import preprocessor
from wombat_api.core import connector as wb_conn

# Build the standard preprocessor (no phrase file) and persist it so that
# the Wombat database can attach it to embedding sets.
pkl_file = "temp/wombat_standard_preprocessor.pkl"
prepro = preprocessor(name="wombat_standard_preprocessor", phrasefile="")
prepro.pickle(pkl_file)

# Open the existing Wombat database (do not create it here) and register the
# pickled preprocessor for every glove-6b configuration in one call.
wbpath = "data/wombat-data/"
wbc = wb_conn(path=wbpath, create_if_missing=False)
wbc.assign_preprocessor(
    "algo:glove;dataset:6b;dims:{50,100,200,300};fold:1;unit:token;norm:{none,abtt}",
    pkl_file)

# Calling this method with an empty string as pickle file name removes the preprocessor.
# wbc.assign_preprocessor("algo:glove;dataset:6b;dims:{50,100,200,300};fold:1;unit:token;norm:{none,abtt}", "")
def _read_idf(path):
    """Read a tab-separated ``<unit>\\t<idf>`` file into a dict of floats.

    Malformed lines (wrong number of tab-separated fields) are skipped.
    The original inline version did ``except ValueError: pass`` and then
    still executed the assignment, silently re-inserting the *previous*
    line's key/value; ``continue`` fixes that.
    """
    idf = {}
    with open(path) as infile:
        for line in infile:
            try:
                key, val = line.strip().split("\t")
            except ValueError:
                continue  # skip malformed lines instead of reusing stale key/val
            idf[key] = float(val)
    return idf


def init_scad_resources(_lock=Lock()):
    """Populate ``classifier.CACHE`` with the resources named in the JSON request body.

    The request body is a JSON object; for each recognised key present, the
    corresponding global resource is created and cached. Returns an empty
    dict (to make Flask happy).

    NOTE: ``_lock`` is deliberately a mutable default — it is created once at
    definition time and shared by all calls, so concurrent initialisation
    requests are actually serialised. The previous version created a fresh
    ``Lock()`` per call (which excluded nothing) and never released it on error;
    the ``with`` block releases it even if resource loading raises.
    """
    with _lock:
        data = json.loads(request.data.decode().strip())
        # TODO Make this more flexible
        if "wombat_path" in data:
            print("Creating global Wombat connector from '%s'" % data['wombat_path'])
            classifier.CACHE['wombat'] = wb_conn(
                path=data['wombat_path'], create_if_missing=False, list_contents=True)
        if "english_stemmer" in data:
            print("Creating global Porter stemmer (English only) ['english_stemmer']")
            classifier.CACHE['english_stemmer'] = PorterStemmer(mode='NLTK_EXTENSIONS')
        if "english_stopwords" in data:
            print("Creating global stopword list for English ['english_stopwords']")
            classifier.CACHE['english_stopwords'] = set(stopwords.words('english'))
        if "english_pretokenizer" in data:
            print("Creating global Moses tokenizer ['english_pretokenizer']")
            classifier.CACHE['english_pretokenizer'] = MosesTokenizer(lang='en')
        # The four IDF resources all follow the same "<key>_path" -> CACHE[<key>]
        # pattern; the original repeated the loading block verbatim four times.
        for cache_key in ('token_dblp_idf', 'token_zbmath_idf',
                          'stem_dblp_idf', 'stem_zbmath_idf'):
            path_key = cache_key + '_path'
            if path_key in data:
                print("Creating global IDF resource from '%s' ['%s'] "
                      % (data[path_key], cache_key))
                classifier.CACHE[cache_key] = _read_idf(data[path_key])
    return {}  # To make Flask happy ...
def main(args):
    """Run concept-project matching experiments.

    Reads aligned concept/project/annotation files, builds term-frequency
    metadata, then classifies concept-project pairs by semantic similarity
    (via ``sem_sim``) over a grid of embeddings x units x measures x
    similarity thresholds, reporting precision/recall/F1 per batch.
    ``args`` is assumed to be an argparse-style namespace — TODO confirm
    against the caller.
    """
    cm_init()
    np.random.seed(4711)  # fixed seed: shuffles below are reproducible across runs
    FWORDS=set()
    PREPRO,WB_CONN=[],[]
    RESULTCACHE,TOKEN_IDF,PUBMETA={},{},{}
    FASTTEXT=None
    # NOTE(review): `batches` is only bound for modes "dev" and "test"; any
    # other value of args.mode leads to a NameError at `batchsize` below.
    if args.mode=="dev":
        batches=1
    elif args.mode=="test":
        batches=10
    embname=args.embeddings
    unitlist=args.units.split(",")
    ttype=args.input  # "label", "description", or "both" (see branch below)
    measurelist=args.measures.split(",")
    # Tri-state CLI flags arrive as strings; anything but 'no' enables them.
    print_classifications=args.print_classifications != 'no'
    print_evidence=args.print_evidence != 'no'
    plot_curves=args.plot_curves != 'no'
    try:
        # Steps for threshold values during dev, or supplied value test_ts
        (sim_start, sim_end, sim_step)=args.sim_ts.split(":")
        ts_vals = np.arange(float(sim_start), float(sim_end), float(sim_step))
    except ValueError:
        # Not a "start:end:step" triple -> a single fixed threshold value.
        ts_vals=[float(args.sim_ts)]
    if "top_n_cos_sim_avg" in measurelist:
        if args.top_n == None:
            print("--top_n required for measure top_n_cos_sim_avg!")
            sys.exit()
        try:
            # top_n may also be a "start:end:step" range, expanding the
            # placeholder measure into one concrete measure per n.
            (n_start, n_end, n_step)=args.top_n.split(":")
            measurelist=[]
            for n in range(int(n_start), int(n_end), int(n_step)):
                measurelist.append('top_'+str(n)+"_cos_sim_avg")
        except ValueError:
            # Single n: replace the placeholder in-place.
            measurelist[measurelist.index('top_n_cos_sim_avg')]='top_'+str(args.top_n)+"_cos_sim_avg"
    print("Using %s units and %s measures"%(str(len(unitlist)), str(len(measurelist))))
    if embname in ['google', 'glove']:
        # Wombat-backed embeddings need a connector plus token IDF weights.
        WB_CONN.append(wb_conn(path=WOMBAT_PATH, create_if_missing=False))
        with open(IDF_PATH_TOKENS) as infile:
            for line in infile:
                try:
                    (key, val) = line.strip().split("\t")
                except ValueError:
                    # NOTE(review): `pass` then assigning below re-inserts the
                    # previous line's key/val on malformed lines — likely
                    # intended to be `continue`; left unchanged here.
                    pass
                TOKEN_IDF[key] = float(val)
        print("Read idf values for %s units from %s"%(str(len(TOKEN_IDF.keys())),IDF_PATH_TOKENS))
    if embname=="fasttext":
        print("Loading fastText model %s"%FASTTEXTFILE)
        FASTTEXT=FastText.load_fasttext_format(FASTTEXTFILE)
        print("done")
    # Function words to be filtered by the preprocessor, one per line.
    with open(FWORDSFILE, "r") as fwf:
        for l in fwf:
            FWORDS.add(l.strip())
    # Unpickle the preprocessor persisted by the setup script.
    with open(PREPROFILE, "rb") as ppf:
        preprocessor = pickle.load(ppf)
        PREPRO.append(preprocessor)
    # NOTE(review): `pairs` is never used afterwards; `tuning_results` below
    # is also write-only.
    pairs,static_pairs=[],[]
    with ExitStack() as stack:
        # Read and align input files
        infiles = [stack.enter_context(open(i, "r")) for i in [CONCEPTFILE, PROJECTFILE, ANNOFILE]]
        for (concept,project,anno) in zip(*infiles):
            # Concept
            full_conc_text = concept.split("\t")[1].strip()
            # Find first upper case word position, which marks the end of the
            # label and the start of the description.
            # NOTE(review): if full_conc_text is empty, `q` below is unbound.
            for q in range(len(full_conc_text)):
                if full_conc_text[q].lower() != full_conc_text[q]:
                    break  # Upper case found
            conc_description_text = full_conc_text[q:].replace("'"," ")
            conc_label_text = full_conc_text[:q].replace("'"," ")
            if ttype == "label":
                concept_text=conc_label_text
            elif ttype == "description":
                concept_text=conc_description_text
            elif ttype == 'both':
                concept_text=conc_label_text+" "+conc_description_text
            unfolded_tf, tokens=prepro(concept_text, PREPRO, FWORDS)
            # Cache the concept's term frequencies under "<label> <descr>->tf".
            PUBMETA[conc_label_text+" "+conc_description_text+"->tf"]=unfolded_tf
            # Project
            (proj_label, proj_title, proj_subject, proj_url, proj_content) = project.split("\t||\t")
            proj_content=proj_content.replace("CONTENT:","").strip().replace("'"," ")
            # Truncate boilerplate from the marker phrase onwards (find() == -1
            # + 1 == 0 would keep nothing; presumably the marker is always
            # present — TODO confirm against the data).
            project_text=proj_content[:proj_content.find("Share your story with Science Buddies")+1].strip().replace("'"," ")
            unfolded_tf, tokens=prepro(project_text, PREPRO, FWORDS)
            PUBMETA[proj_label+"->tf"]=unfolded_tf
            # Label
            label = anno.strip()
            # Create labelled instance ...
            e=(proj_label, conc_label_text+" "+conc_description_text, label)
            # ... but ignore duplicates
            if e not in static_pairs:
                static_pairs.append(e)
    print("%s unique labelled concept-project pairs were read!"%str(len(static_pairs)))
    # Start experiments
    np.random.shuffle(static_pairs)  # Shuffle all static_pairs once
    ps, rs, fs, tuning_results = [], [], [], []
    batchsize = float(1 / batches)
    # Test several parameter combinations
    if embname == "glove":
        emblist = ["algo:glove;dataset:840b;dims:300;fold:0;norm:none;unit:token"]
    elif embname == "google":
        emblist = ["algo:sg;dataset:google-swes;dims:300;fold:0;norm:none;unit:token"]
    elif embname == "fasttext":
        emblist = [embname]
    results_for_units = []    # For each value of units, collect all results
    results_for_measure = []  # For each value of measure, collect all results
    for emb in emblist:
        for units in unitlist:
            for measure in measurelist:
                # Results per param combi, will collect results for *all*
                # values in ts_vals (x axis).
                plotp, plotr, plotf, plotts = [], [], [], []
                for ts in ts_vals:
                    if batches == 1:
                        # TUNING: Get first 20% tuning set from all pairs, create new list.
                        # Tuning pairs are the same in each run.
                        tpairs = list(static_pairs[:int(len(static_pairs) * 0.2)])  # 20 percent
                    else:
                        # TESTING: Get last 80% as actual test data
                        tpairs = list(static_pairs[int(len(static_pairs) * 0.2):])  # 80 percent
                    for s in range(batches):
                        np.random.shuffle(tpairs)
                        c_pairs = tpairs[:int(len(tpairs) * batchsize)]
                        print("\n\nStarting batch %s of %s (%s instances)"%(str(s+1), str(batches), str(len(c_pairs))))
                        true_labels, pred_labels, mapper=[], [], []
                        # Create classifications for data in current batch, using current
                        # setup, incl ts. For dev, this is only done once for each setup.
                        for i, (p_id, c_id, l) in enumerate(c_pairs):
                            true_labels.append(int(l))
                            pred_labels.append(0)
                            mapper.append(c_id + " " + p_id)  # For mapping c-p pair to label list index
                            # For avg_cosine, evidence is an empty dummy list
                            sim, evidence = sem_sim(c_id, p_id, measure, units, emb, PUBMETA, WB_CONN, TOKEN_IDF, RESULTCACHE, FASTTEXT)
                            if sim >= ts:
                                pred_labels[mapper.index(c_id + " " + p_id)] = 1
                            if print_classifications:
                                pair_id=c_id + " " + p_id
                                tl=str(true_labels[mapper.index(pair_id)])
                                pl=str(pred_labels[mapper.index(pair_id)])
                                # Green for a correct prediction, red otherwise.
                                if tl==pl:
                                    col =Back.GREEN
                                else:
                                    col =Back.RED
                                st=col+"True / Pred: %s / %s"+Style.RESET_ALL+" Sim: %s"
                                print(st%(tl, pl ,str(sim)))
                                if print_evidence and evidence != []:
                                    print(evidence)
                        # All instances in current batch are classified, using the
                        # current setup. Results are in pred_labels.
                        if args.mode == "dev":  # dev mode
                            # Each setup will produce one p, r, and f value, which we collect here.
                            plotp.append(metrics.precision_score(true_labels, pred_labels))
                            plotr.append(metrics.recall_score(true_labels, pred_labels))
                            plotf.append(metrics.f1_score(true_labels, pred_labels))
                            plotts.append(ts)
                            if print_classifications:
                                print("Batch %s evaluation:\nP: %s, R: %s, F: %s"%(str(s+1),str(plotp[-1]), str(plotr[-1]), str(plotf[-1])))
                        else:  # test mode
                            ps.append(metrics.precision_score(true_labels, pred_labels))
                            rs.append(metrics.recall_score(true_labels, pred_labels))
                            fs.append(metrics.f1_score(true_labels, pred_labels))
                            if print_classifications:
                                print("Batch %s evaluation:\nP: %s, R: %s, F: %s"%(str(s+1),str(ps[-1]), str(rs[-1]), str(fs[-1])))
                    # All batches using the current setup are finished, and their
                    # results are collected in plotp, plotr, plotf, and plotts.
                    print("\nTS val %s done"%"{0:.4f}".format(ts))
                # Store all batch results for current measure, include label for plot
                label = measure + "," + ttype + "," + units + "," + embname
                results_for_measure.append((measure, label, plotp, plotr, plotf, plotts))
            # end iteration over measures
            results_for_units.append((units, results_for_measure))
            results_for_measure=[]
        # end iteration over units
    # end iteration over emblist
    if plot_curves:
        make_plot(results_for_units)
    if args.mode=="test":
        # Summary over all test batches; emb/units/measure/ts hold the values
        # from the last grid iteration.
        print("Evaluation after %s batches:\n----------------------------"%str(batches))
        print("Embeddings:\t%s\nInput:\t\t%s\nUnits:\t\t%s\nMeasure:\t%s\nMin. Sim:\t%s\n"%(emb,ttype,units,measure,str(ts)))
        print("Mean P: %s (%s)\nMean R: %s (%s)\nMean F: %s (%s)"%(np.mean(ps),np.std(ps), np.mean(rs),np.std(rs), np.mean(fs),np.std(fs)))
from wombat_api.core import connector as wb_conn

wbpath = "data/wombat-data/"
importpath = "data/embeddings/glove.6B/"

# Create (or open) the Wombat database, then import every glove-6B variant:
# each dimensionality in both its raw and all-but-the-top normalised form.
wbc = wb_conn(path=wbpath, create_if_missing=True)
for dims in ('50', '100', '200', '300'):
    for norm in ('none', 'abtt'):
        vec_file = "{}glove.6B.{}d.txt".format(importpath, dims)
        emb_spec = "algo:glove;dataset:6b;dims:{};fold:1;unit:token;norm:{}".format(dims, norm)
        wbc.import_from_file(vec_file, emb_spec, normalize=norm, prepro_picklefile="")