Example #1
def __init__(self, wombat_path=""):
    self.wombat_path = wombat_path
    try:
        print('Connecting to wombat path:', self.wombat_path)
        self.wbc = wb_conn(path=self.wombat_path, create_if_missing=False)
    except Exception:
        # Fall back to no connector if the Wombat database cannot be opened.
        self.wbc = None
        print("ERROR: unable to locate {} file".format(self.wombat_path))
Example #2
from wombat_api.preprocessors.standard_preprocessor import preprocessor
from wombat_api.core import connector as wb_conn

prepro = preprocessor(name="wombat_standard_preprocessor", phrasefile="")
prepro.pickle("temp/wombat_standard_preprocessor.pkl")

wbpath = "data/wombat-data/"
wbc = wb_conn(path=wbpath, create_if_missing=False)
wbc.assign_preprocessor(
    "algo:glove;dataset:6b;dims:{50,100,200,300};fold:1;unit:token;norm:{none,abtt}",
    "temp/wombat_standard_preprocessor.pkl")

# Calling this method with an empty string as pickle file name removes the preprocessor.
# wbc.assign_preprocessor("algo:glove;dataset:6b;dims:{50,100,200,300};fold:1;unit:token;norm:{none,abtt}", "")
Example #3
# Imports assumed by this handler; 'classifier' is a project-local module providing CACHE,
# and MosesTokenizer is assumed to come from sacremoses.
import json
from threading import Lock

from flask import request
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sacremoses import MosesTokenizer
from wombat_api.core import connector as wb_conn


def init_scad_resources():
    lock = Lock()
    lock.acquire(blocking=True)
    sd = request.data.decode().strip()
    data = json.loads(sd)

    # TODO Make this more flexible

    if "wombat_path" in data:
        print("Creating global Wombat connector from '%s'" %
              data['wombat_path'])
        classifier.CACHE['wombat'] = wb_conn(path=data['wombat_path'],
                                             create_if_missing=False,
                                             list_contents=True)

    if "english_stemmer" in data:
        print(
            "Creating global Porter stemmer (English only) ['english_stemmer']"
        )
        classifier.CACHE['english_stemmer'] = PorterStemmer(
            mode='NLTK_EXTENSIONS')

    if "english_stopwords" in data:
        print(
            "Creating global stopword list for English ['english_stopwords']")
        classifier.CACHE['english_stopwords'] = set(stopwords.words('english'))

    if "english_pretokenizer" in data:
        print("Creating global Moses tokenizer ['english_pretokenizer']")
        classifier.CACHE['english_pretokenizer'] = MosesTokenizer(lang='en')

    if "token_dblp_idf_path" in data:
        print("Creating global IDF resource from '%s' ['token_dblp_idf'] " %
              data['token_dblp_idf_path'])
        temp_idf = {}
        with open(data['token_dblp_idf_path']) as infile:
            for line in infile:
                try:
                    (key, val) = line.strip().split("\t")
                except ValueError:
                    continue  # skip malformed lines instead of reusing stale key/val
                temp_idf[key] = float(val)
        classifier.CACHE['token_dblp_idf'] = temp_idf

    if "token_zbmath_idf_path" in data:
        print("Creating global IDF resource from '%s' ['token_zbmath_idf'] " %
              data['token_zbmath_idf_path'])
        temp_idf = {}
        with open(data['token_zbmath_idf_path']) as infile:
            for line in infile:
                try:
                    (key, val) = line.strip().split("\t")
                except ValueError:
                    continue  # skip malformed lines instead of reusing stale key/val
                temp_idf[key] = float(val)
        classifier.CACHE['token_zbmath_idf'] = temp_idf

    if "stem_dblp_idf_path" in data:
        print("Creating global IDF resource from '%s' ['stem_dblp_idf'] " %
              data['stem_dblp_idf_path'])
        temp_idf = {}
        with open(data['stem_dblp_idf_path']) as infile:
            for line in infile:
                try:
                    (key, val) = line.strip().split("\t")
                except ValueError:
                    continue  # skip malformed lines instead of reusing stale key/val
                temp_idf[key] = float(val)
        classifier.CACHE['stem_dblp_idf'] = temp_idf

    if "stem_zbmath_idf_path" in data:
        print("Creating global IDF resource from '%s' ['stem_zbmath_idf'] " %
              data['stem_zbmath_idf_path'])
        temp_idf = {}
        with open(data['stem_zbmath_idf_path']) as infile:
            for line in infile:
                try:
                    (key, val) = line.strip().split("\t")
                except ValueError:
                    continue  # skip malformed lines instead of reusing stale key/val
                temp_idf[key] = float(val)
        classifier.CACHE['stem_zbmath_idf'] = temp_idf

    lock.release()
    return {}  # To make Flask happy ...
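The four IDF blocks above repeat the same tab-separated loading pattern; a small helper along these lines could factor it out (load_idf_table is an illustrative name, not part of the original code):

def load_idf_table(path):
    """Read a 'term<TAB>idf' file into a dict, skipping malformed lines (illustrative helper)."""
    idf = {}
    with open(path) as infile:
        for line in infile:
            try:
                key, val = line.strip().split("\t")
            except ValueError:
                continue  # skip lines that do not contain exactly one tab
            idf[key] = float(val)
    return idf

# Usage, e.g.: classifier.CACHE['token_dblp_idf'] = load_idf_table(data['token_dblp_idf_path'])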
Example #4
def main(args):

    cm_init()
    np.random.seed(4711)
    FWORDS=set()
    PREPRO,WB_CONN=[],[]
    RESULTCACHE,TOKEN_IDF,PUBMETA={},{},{}
    FASTTEXT=None

    if args.mode=="dev":    batches=1
    elif args.mode=="test": batches=10
           
    embname=args.embeddings
    unitlist=args.units.split(",")    
    ttype=args.input
    measurelist=args.measures.split(",")
    print_classifications=args.print_classifications != 'no'    
    print_evidence=args.print_evidence != 'no'
    plot_curves=args.plot_curves != 'no'

    try:
        (sim_start, sim_end, sim_step)=args.sim_ts.split(":")
        # Threshold steps swept during dev (e.g. "0.5:0.9:0.1" yields 0.5, 0.6, 0.7, 0.8),
        # or the single value supplied for testing.
        ts_vals = np.arange(float(sim_start), float(sim_end), float(sim_step))
    except ValueError:
        ts_vals=[float(args.sim_ts)]

    if "top_n_cos_sim_avg" in measurelist:
        if args.top_n is None:
            print("--top_n required for measure top_n_cos_sim_avg!")
            sys.exit()
        try:            
            (n_start, n_end, n_step)=args.top_n.split(":")
            measurelist=[]
            for n in range(int(n_start), int(n_end), int(n_step)):
                measurelist.append('top_'+str(n)+"_cos_sim_avg")
        except ValueError:
            measurelist[measurelist.index('top_n_cos_sim_avg')]='top_'+str(args.top_n)+"_cos_sim_avg"

    print("Using %s units and %s measures"%(str(len(unitlist)), str(len(measurelist))))

    if embname in ['google', 'glove']:
        WB_CONN.append(wb_conn(path=WOMBAT_PATH, create_if_missing=False))

    with open(IDF_PATH_TOKENS) as infile:
        for line in infile:
            try:
                (key, val) = line.strip().split("\t")
            except ValueError:
                continue  # skip malformed lines
            TOKEN_IDF[key] = float(val)
        print("Read idf values for %s units from %s"%(str(len(TOKEN_IDF.keys())),IDF_PATH_TOKENS))

    if embname=="fasttext":
        print("Loading fastText model %s"%FASTTEXTFILE)
        FASTTEXT=FastText.load_fasttext_format(FASTTEXTFILE)
        print("done")
           
    with open(FWORDSFILE, "r") as fwf:
        for l in fwf: FWORDS.add(l.strip())
    with open(PREPROFILE, "rb") as ppf: 
        preprocessor = pickle.load(ppf)
    PREPRO.append(preprocessor)    

    pairs,static_pairs=[],[]
    with ExitStack() as stack:        
        # Read and align input files
        infiles = [stack.enter_context(open(i, "r")) for i in [CONCEPTFILE, PROJECTFILE, ANNOFILE]]
        for (concept,project,anno) in zip(*infiles):            
            # Concept
            full_conc_text = concept.split("\t")[1].strip()
            for q in range(len(full_conc_text)):
                # Find the first upper-case character, which marks the end of the label and the start of the description
                if full_conc_text[q].lower() != full_conc_text[q]:
                    break # Upper case found

            conc_description_text   = full_conc_text[q:].replace("'"," ")
            conc_label_text         = full_conc_text[:q].replace("'"," ")

            if ttype    == "label":         concept_text=conc_label_text                 
            elif ttype  == "description":   concept_text=conc_description_text                                                                                                 
            elif ttype  == 'both':          concept_text=conc_label_text+" "+conc_description_text                    
            unfolded_tf, tokens=prepro(concept_text, PREPRO, FWORDS)
            PUBMETA[conc_label_text+" "+conc_description_text+"->tf"]=unfolded_tf

            # Project
            (proj_label, proj_title, proj_subject, proj_url, proj_content) = project.split("\t||\t")
            proj_content=proj_content.replace("CONTENT:","").strip().replace("'"," ")
            project_text=proj_content[:proj_content.find("Share your story with Science Buddies")+1].strip().replace("'"," ")
            unfolded_tf, tokens=prepro(project_text, PREPRO, FWORDS)
            PUBMETA[proj_label+"->tf"]=unfolded_tf

            # Label
            label = anno.strip()
            
            # Create labelled instance ...
            e=(proj_label, conc_label_text+" "+conc_description_text, label)
            # ... but ignore duplicates
            if e not in static_pairs: 
                static_pairs.append(e)                
    print("%s unique labelled concept-project pairs were read!"%str(len(static_pairs)))

    # Start experiments
    np.random.shuffle(static_pairs)     # Shuffle all static_pairs once
    ps, rs, fs, tuning_results = [], [], [], []
    batchsize = float(1 / batches)
        
    # Test several parameter combinations
    if      embname == "glove":     emblist = ["algo:glove;dataset:840b;dims:300;fold:0;norm:none;unit:token"]
    elif    embname == "google":    emblist = ["algo:sg;dataset:google-swes;dims:300;fold:0;norm:none;unit:token"]
    elif    embname == "fasttext":  emblist = [embname]    

    results_for_units = []          # For each value of units, collect all results
    results_for_measure = []        # For each value of measure, collect all results
    for emb in emblist:
        for units in unitlist:
            for measure in measurelist:
                plotp, plotr, plotf, plotts = [], [], [], []    # Results per param combination; collects results for *all* values in ts_vals (x axis)
                for ts in ts_vals:
                    if batches == 1:    # TUNING: Get first 20% tuning set from all pairs, create new list. Tuning pairs are the same in each run
                        tpairs = list(static_pairs[:int(len(static_pairs) * 0.2)]) # 20 percent

                    else:               # TESTING: Get last 80% as actual test data
                        tpairs = list(static_pairs[int(len(static_pairs) * 0.2):]) # 80 percent

                    for s in range(batches):                            
                        np.random.shuffle(tpairs)
                        c_pairs = tpairs[:int(len(tpairs) * batchsize)]
                        print("\n\nStarting batch %s of %s (%s instances)"%(str(s+1), str(batches), str(len(c_pairs))))
                        true_labels, pred_labels, mapper=[], [], []

                        # Create classifications for data in the current batch, using the current setup, incl. ts. For dev, this is done only once for each setup.
                        for i, (p_id, c_id, l) in enumerate(c_pairs):
                            true_labels.append(int(l))
                            pred_labels.append(0)
                            mapper.append(c_id + " " + p_id)  # For mapping c-p pair to label list index
                            # For avg_cosine, evidence is an empty dummy list
                            sim, evidence = sem_sim(c_id, p_id, measure, units, emb, PUBMETA, WB_CONN, TOKEN_IDF, RESULTCACHE, FASTTEXT)
                            if sim >= ts:
                                pred_labels[mapper.index(c_id + " " + p_id)] = 1

                            if print_classifications:
                                pair_id=c_id + " " + p_id
                                tl=str(true_labels[mapper.index(pair_id)])
                                pl=str(pred_labels[mapper.index(pair_id)])
                                col = Back.GREEN if tl == pl else Back.RED
                                st = col + "True / Pred: %s / %s" + Style.RESET_ALL + " Sim: %s"
                                print(st % (tl, pl, str(sim)))
                            if print_evidence and evidence != []: 
                                print(evidence)
                                                    
                        # All instances in current batch are classified, using the current setup. Results are in pred_labels.
                        if args.mode == "dev":  # dev mode
                            # Each setup will produce one p,r, and f value, which we collect here                            
                            plotp.append(metrics.precision_score(true_labels, pred_labels))
                            plotr.append(metrics.recall_score(true_labels, pred_labels))
                            plotf.append(metrics.f1_score(true_labels, pred_labels))
                            plotts.append(ts)
                            if print_classifications:
                                print("Batch %s evaluation:\nP: %s, R: %s, F: %s"%(str(s+1),str(plotp[-1]), str(plotr[-1]), str(plotf[-1])))

                        else:           # test mode
                            ps.append(metrics.precision_score(true_labels, pred_labels))
                            rs.append(metrics.recall_score(true_labels, pred_labels))
                            fs.append(metrics.f1_score(true_labels, pred_labels))
                            if print_classifications:
                                print("Batch %s evaluation:\nP: %s, R: %s, F: %s"%(str(s+1),str(ps[-1]), str(rs[-1]), str(fs[-1])))

                    # All batches using the current setup are finished, and their results are collected in plotp, plotr, plotf, and plotts.
                    print("\nTS val %s done"%"{0:.4f}".format(ts))
                # Store all batch results for current measure, include label for plot
                label = measure + "," + ttype + "," + units + "," + embname
                results_for_measure.append((measure, label, plotp, plotr, plotf, plotts))
            results_for_units.append((units, results_for_measure))
            results_for_measure=[]
            # end iteration over measures
        # end iteration over units
    # end iteration over emblist 

    if plot_curves:
        make_plot(results_for_units)

    if args.mode=="test":
        print("Evaluation after %s batches:\n----------------------------"%str(batches))
        print("Embeddings:\t%s\nInput:\t\t%s\nUnits:\t\t%s\nMeasure:\t%s\nMin. Sim:\t%s\n"%(emb,ttype,units,measure,str(ts)))
        print("Mean P: %s (%s)\nMean R: %s (%s)\nMean F: %s (%s)"%(np.mean(ps),np.std(ps), np.mean(rs),np.std(rs), np.mean(fs),np.std(fs)))
Example #5
from wombat_api.core import connector as wb_conn
wbpath = "data/wombat-data/"
importpath = "data/embeddings/glove.6B/"

wbc = wb_conn(path=wbpath, create_if_missing=True)

for d in ['50', '100', '200', '300']:
    for n in ['none', 'abtt']:
        wbc.import_from_file(importpath + "glove.6B." + d + "d.txt",
                             "algo:glove;dataset:6b;dims:" + d +
                             ";fold:1;unit:token;norm:" + n,
                             normalize=n,
                             prepro_picklefile="")
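Once the vectors have been imported, they can be retrieved through the same connector. A minimal sketch, following the retrieval call from the Wombat README; treat the exact get_vectors signature as an assumption that may differ between versions:

import numpy as np
from wombat_api.core import connector as wb_conn

wbc = wb_conn(path="data/wombat-data/", create_if_missing=False)
wec_ids = "algo:glove;dataset:6b;dims:50;fold:1;unit:token;norm:none"
# Retrieve embeddings for one tokenized sentence (assumed signature).
vecs = wbc.get_vectors(wec_ids, {}, for_input=[np.array("this is a test".split())])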