def featurize(p, featurefns):
    a = []
    for accumulator in accumulators:
        a.append(accumulator(p))
    x = []
    for featurefn in featurefns:
        try:
            fs = featurefn(a)
            for f in fs:
                try:
                    len(f)
                    printd("Bad feature from %s" % featurefn.__name__, -1)
                    printd(f, -1)
                    sys.exit(-1)
                except TypeError:
                    pass
            x.extend(fs)
        except Exception:
            print >> sys.stderr, "Pair:", p.pid
            print >> sys.stderr, "%d" % len(p.s1["vector"]), " ".join(p.s1["wv_tokens"])
            print >> sys.stderr, "%d" % len(p.s2["vector"]), " ".join(p.s2["wv_tokens"])
            traceback.print_exc()
            sys.exit(-1)
    if len(x) == 0 and len(featurefns) != 0:
        print >> sys.stderr, "Pair:", p.pid
        print >> sys.stderr, "%d" % len(p.s1["vector"]), " ".join(p.s1["wv_tokens"])
        print >> sys.stderr, "%d" % len(p.s2["vector"]), " ".join(p.s2["wv_tokens"])
        sys.exit(-1)
    return x
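# [Sketch] Hypothetical example of the contract featurize() enforces: a feature
# function takes the list of accumulator outputs and returns a flat list of
# scalars.  Anything with a len() (a nested list, string, etc.) trips the
# "Bad feature" check above and aborts.  Names and inputs here are illustrative,
# not part of the original code.
def _example_featurefn(accumulated):
    # assumes the first accumulator produced a numeric value
    first = accumulated[0]
    return [float(first), float(first) ** 2]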
def __init__(self, path, neg_samples=5):
    random.seed(42)
    pfiles = os.listdir(path)
    printd("Loading dataset")
    alldata = []
    size = 0
    if "train" in pfiles:
        for d in ["train", "test", "valid"]:
            if d not in pfiles:
                continue
            dpath = os.path.join(path, d)
            nset = NuggetSet(dpath, neg_samples)
            size += nset.size
            alldata.extend([nset.nuggets, nset.updates])
            setattr(self, d, nset.pairs)
    else:
        nset = NuggetSet(path, neg_samples)
        alldata.extend([nset.nuggets, nset.updates])
        self.test = nset.pairs
        size += nset.size
    self.size = size
    self.data = Superset(*alldata)
    self.writer = nset.writer
    if len(self.valid()) != 0:
        self._train = SuperList(self.train(), self.valid())
        self.train = lambda: self._train
def normalize(self):
    wv = self.wordvec
    printd("Normalizing: wv is %s:" % (str(wv.syn0.shape)))
    # Shift to mean = 0
    #means = np.mean(wv.syn0, axis=0)
    #scales = 1 / np.maximum(abs(means), abs(1 - means))
    #wv.syn0 = ((wv.syn0 - means) / scales).astype(np.float32)
    wv.syn0 -= np.mean(wv.syn0, axis=0)
    # Cut off negatives and append the flipped vector as a positive half
    wv.syn0 = np.concatenate((np.maximum(0, wv.syn0), np.maximum(0, -wv.syn0)), axis=1)
    # Unit norm
    wv.syn0 = sklearn.preprocessing.normalize(wv.syn0, axis=0)
    self.size *= 2
    wv.vector_size = self.size
    printd("Done normalizing: wv is %s:" % (str(wv.syn0.shape)))
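# [Sketch] Rough standalone illustration of the normalization above, assuming the
# intent is: zero-mean each dimension, split every dimension into non-negative
# positive/negative halves, then L2-normalize each column.  The toy matrix and
# the helper name are hypothetical.
def _normalize_sketch():
    import numpy as np
    import sklearn.preprocessing
    vecs = np.array([[1.0, -2.0], [0.5, 0.0], [-1.5, 2.0]])
    vecs -= np.mean(vecs, axis=0)
    vecs = np.concatenate((np.maximum(0, vecs), np.maximum(0, -vecs)), axis=1)
    return sklearn.preprocessing.normalize(vecs, axis=0)  # shape doubles from (3, 2) to (3, 4)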
def maxShortSentence(self):
    ls = Cycle([0, 0])
    try:
        for dset in self.data:
            l = ls.nextitem()
            for s in dset:
                cl = len(s["wv_tokens"])
                if cl > l:
                    l = cl
            ls.setitem(l)
    except KeyError, e:
        printd(e, -1)
        printd(s, -1)
        traceback.print_stack()
        sys.exit(-1)
def logdf(self, tf=None): if tf is None: if self.__logdf: return self.__logdf return np.zeros(self.size) logdf = np.zeros(self.size) nterms = 0 for t, tc in tf.items(): if t in self.wordvec: logdf += self[t] * tc nterms += 1 self.__logdf = np.nan_to_num(np.log2(logdf)) printd("Found %d (%0.2f%% toks, %0.2f%% wv) terms from vocab in wordvec" % (nterms, 100 * nterms / len(tf), 100 * nterms / len(self.wordvec.vocab))) return self.__logdf
def logdf(self, tf=None): if tf is None: if self.__logdf: return self.__logdf return np.zeros(self.size) logdf = np.zeros(self.size) nterms = 0 for t, tc in tf.items(): if t in self.wordvec: logdf += self[t] * tc nterms += 1 self.__logdf = np.nan_to_num(np.log2(logdf)) printd( "Found %d (%0.2f%% toks, %0.2f%% wv) terms from vocab in wordvec" % (nterms, 100 * nterms / len(tf), 100 * nterms / len(self.wordvec.vocab))) return self.__logdf
def munkres_handler(signum, frame):
    printd("Can't keep waiting...")
    print frame
    raise Exception("ran out of time...")
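# [Sketch] How the handler above is used elsewhere in this code: arm a SIGALRM
# before the Hungarian-algorithm call and clear it afterwards, so a pathological
# distance matrix cannot hang the matcher.  The wrapper name is hypothetical;
# SIGALRM is Unix-only.
def _with_timeout_sketch(fn, seconds=10):
    import signal
    signal.signal(signal.SIGALRM, munkres_handler)
    signal.alarm(seconds)
    try:
        return fn()
    finally:
        signal.alarm(0)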
def main(args):
    global wordvec, wordvecf
    conf.debug = args.debug or args.verbose
    conf.verbose = args.verbose
    conf.args = args
    #nf = args.nuggets
    #uf = args.updates
    #mf = args.matches
    sf = args.shingles
    vf = args.wordvec
    #ef = args.evalfile
    wvout = args.wvfile
    sim_thr = args.sim_thr
    dset = args.dataset
    limit = args.limit
    #if args.dataset == "auto":
    #    if ef is not None:
    #        dset = "semeval"
    #    else:
    #        with open(glob.glob(nf)[0]) as nh:
    #            nfhead = nh.readline()
    #            if nfhead.startswith("query_id\tnugget_id"):
    #                dset = "ts"
    #            elif nfhead.startswith("query_id\tvs_id"):
    #                dset = "mclick"
    #            else:
    #                dset = "1click"
    if os.path.exists(wvout) and not args.force:
        wordvecf = wvout
    if vf:
        printd("Reading word vector...")
        #wordvec = load_wordvec()
        wordvec = WordVec(wordvecf)
    if args.sim == "minsim":
        matcher = MinDistSim
    elif args.sim == "infsim":
        matcher = InfSim
    else:
        matcher = VecSim
    if args.sim == "infsim" or args.comparator == "infsim":
        wordvec.normalize()
    #if dset == "ts":
    #    nuggfn = Nuggets
    #    updfn = Updates
    #    outfn = MatchWriter
    #elif dset == "1click":
    #    nuggfn = CLNuggets
    #    updfn = CLUpdates
    #    outfn = CLMatchWriter
    #elif dset == "mclick":
    #    nuggfn = MCNuggets
    #    updfn = Updates
    #    outfn = MCMatchWriter
    #elif dset == "semeval":
    #    data = SemEvalDataset(args.input_data, args.evalfile)
    #    outfn = data.writer
    #    if vf is not None:
    #        data.vectorize(wordvec)
    #else:
    #    nuggfn = MCNuggets
    #    updfn = Updates
    #    outfn = MCMatchWriter
    data = Dataset.load(args.input_data, dset)
    if vf is not None:
        data.vectorize(wordvec)
    #if dset == "semeval":
    #    data = SemEvalDataset(args.input_data, args.evalfile)
    #    #outfn = data.writer
    #    if vf is not None:
    #        data.vectorize(wordvec)
    #else:
    #    printd("Processing Nuggets...")
    #    #nuggets = nuggfn(nf, vectorize=vf is not None)
    #    printd("Processing Updates...")
    #    #updates = updfn(uf, vectorize=vf is not None)
    #    #data = NuggetDataset(nuggets, updates, mf)
    #    data = NuggetDataset(nf, uf, mf, dset=dset, vectorize=vf is not None)
    if vf and wvout is not None and wvout != wordvecf:
        printd("Rereading word vectors to optimize...")
        wv_toks = data.wv_sentences()
        #if dset == "semeval":
        #    wv_toks = data.wv_sentences()
        #else:
        #    wv_toks = nuggets.wv_text() + updates.wv_text()
        wordvec = WordVec(wordvecf, sentences=wv_toks, wvout=wvout, size=wordvec.originalsize)
        if args.sim == "infsim" or args.comparator == "infsim":
            wordvec.normalize()
        data.vectorize(wordvec)
        with open(wvout + ".vocab", 'w') as wh:
            wh.write("\n".join(wordvec.vocab().keys()))
        with open(wvout + ".toks", 'w') as wh:
            wh.write("\n".join([" ".join(x) for x in wv_toks]))
        #vocab = nuggets.wv_vocab().union(updates.wv_vocab())
        #wordvec.trim(lambda word, count, min_count: gensim.utils.RULE_KEEP if word in vocab else gensim.utils.RULE_DISCARD)
        #wordvec.save(wvout)
    vocab = None
    if args.frequencies:
        try:
            with open(args.frequencies) as fh:
                vocab = json.load(fh)
            # For Term Frequencies instead of Document Frequencies
            # Could also do len(vocab[word]) if wanted to mimic DF
            if type(vocab.itervalues().next()) == dict:
                for word in vocab:
                    vocab[word] = sum(vocab[word].itervalues())
        except Exception:
            pass
    if vocab is None:
        vocab = data.wv_vocab()
    logdf = wordvec.logdf(vocab)
    logdffile = wordvecf + ".logdf"
    #if not os.path.exists(logdffile) or (os.path.getmtime(logdffile) < os.path.getmtime(wordvecf)):
    #    np.savetxt(logdffile, logdf, delimiter=" ", fmt="%g")
    np.savetxt(logdffile, logdf, delimiter=" ", fmt="%g")
    if args.comparator == "infsim" and args.sim != "infsim":
        comparator = InfSim(logdf).pairwisedist
    else:
        comparator = args.comparator
    matcher = matcher(df=logdf, metric=comparator)
    data.normalize(matcher, logdf)
    printd("Finding matches...")
    matches = []
    with data.writer(sf) as sw, data.writer(vf) as vw:
        mcnt = 0
        timer = Timer()
        for pair in data.test():
            if sf:
                match = shingle(pair.s1["tokens"], pair.s2["tokens"])
                if match.score >= min_score:
                    sw.write(pair, match)
            if vf:
                printd("Matching pair %s" % (pair.pid), level=1)
                try:
                    sim = matcher.match(pair)
                    matches.append((matcher.tsim, unicode(matcher)))
                except ValueError, err:
                    printd(err)
                    sim = sim_thr
                printd("Match %0.4f for %s, %s" % (sim, pair.sid1, pair.sid2))
                if sim < sim_thr:
                    sim = sim_thr
                    start = matcher.start
                    end = matcher.end - matcher.start
                else:
                    start = -1
                    end = len(pair.s2["tokens"]) - 1
                match = Match(sim, start, end)
                vw.write(pair, match)
            mcnt += 1
            if (mcnt % 100000) == 0:
                print >> sys.stderr, "%g tmps" % (100 / timer.mark())
            if limit and mcnt >= limit:
                return
    if conf.verbose:
        for tsim, match in sorted(matches):
            print match
def normalize(self, matcher, df):
    printd("Normalizing dset")
    for rid, rec in self.data.iteritems():
        rec["vector"], rec["vector_sum"] = matcher.normalize(rec["vector"], df)
def processData(args):
    data = dataset.Dataset.load(args.input_data, args.dataset)
    wvout = args.wvfile
    if os.path.exists(wvout):
        wordvecf = wvout
    else:
        wordvecf = args.wvsource
    features = {x for x in args.basefeatures.split(',') if x != ''}
    matchers = {x for x in args.matchers.split(',') if x != ''}
    printd("Loading Word Vectors")
    wordvec = WordVec(wordvecf)
    printd("Vectorizing")
    data.vectorize(wordvec)
    maxwords = data.maxShortSentence()
    if wvout != wordvecf:
        printd("Rereading word vectors to optimize...")
        wv_toks = data.wv_sentences()
        wordvec = WordVec(wordvecf, sentences=wv_toks, wvout=wvout, size=wordvec.originalsize)
        data.vectorize(wordvec)
    conf.wvsize = wordvec.size
    # Train data
    printd("Computing basic WV Features")
    fs = FeatureSet(data, features)
    if "Pair" in matchers:
        printd("Computing Pair Features")
        matcher = vectorsim.PairFeatures(dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher)
    if "Shingle" in matchers:
        printd("Computing Shingle Features")
        matcher = Shingler(slop=12, lmbda=0.95)
        fs.addMatcher(matcher)
    vocab = None
    if "MinDistSim" in matchers:
        printd("Computing MinDist")
        vocab = fs.data.wv_vocab()
        data.weight()
        comparator = 'cosine'
        matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher, 'cos')
        printd("Computing MinDist-Euclidean")
        comparator = 'euclidean'
        matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher, 'euc')
    if "NGram" in matchers:
        printd("Computing MinDist-Ngram")
        vocab = fs.data.wv_vocab()
        if vocab is None:
            vocab = fs.data.wv_vocab()
        comparator = 'cosine'
        matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, ngram=2, dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher, 'cos-bigram')
        comparator = 'cosine'
        matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, ngram=3, dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher, 'cos-trigram')
    if "WWSim" in matchers:
        printd("Computing WWSim")
        matcher = vectorsim.WWSim(wordvec=wordvec, dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher)
    if "InfRankSim" in matchers:
        printd("Computing InfRankSim")
        matcher = vectorsim.InfRankSim(data=data, wordvec=wordvec, dimfeatures=args.dimfeatures)
        printd("InfRankSim Matching")
        fs.addMatcher(matcher)
    if "InfSim" in matchers:
        # We normalize after, so the primary features are the raw word vectors
        printd("Computing InfSim")
        wordvec.normalize()
        data.vectorize(wordvec)
        matcher = vectorsim.InfSim(data=data, wordvec=wordvec, dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher)
    return fs
def addMatcher(self, matcher, namebase=""): names = [namebase + x for x in matcher.names()] self.names.extend(names) for datum, pair in chain(izip(self.train, self.data.train()), izip(self.test, self.data.test())): try: matcher.match(pair) except ValueError, err: printd(err) fs = matcher.features() if len(names) != len(fs): printd( "Incorrect names for features for %s: %d vs %d" % (matcher.__class__.__name__, len(names), len(fs)), -1) printd(names, -1) sys.exit(-1) for f in fs: # Checking if np.isnan(f) or np.isinf(f): printd("Bad feature from %s" % matcher.__class__.__name__, -1) printd(f, -1) printd(pair, -1) sys.exit(-1) datum.extend(fs)
def main(args):
    conf.debug = args.debug
    conf.verbose = args.verbose
    conf.args = args
    if args.alphas == "auto":
        pass  # TODO
    else:
        conf.alphas = [float(x) for x in args.alphas.split(",")]
    try:
        args.max_features = int(args.max_features)
    except (ValueError, TypeError):
        try:
            args.max_features = float(args.max_features)
        except (ValueError, TypeError):
            pass
    fitargs = {}
    predictargs = {}
    if args.featurefile and not args.force and os.path.exists(args.featurefile + ".train"):
        printd("Loading Saved Features")
        fs = FeatureSet.read(args.featurefile)
    else:
        fs = processData(args)
        if args.featurefile:
            printd("Writing Features")
            fs.write(args.featurefile)
    #kpca = KernelPCA(kernel="rbf", gamma=10)
    if args.model == "randomforests":
        if args.classify:
            model = ensemble.RandomForestClassifier
        else:
            model = ensemble.RandomForestRegressor
    elif args.model == "extratrees":
        if args.classify:
            model = ensemble.ExtraTreesClassifier
        else:
            model = ensemble.ExtraTreesRegressor
    elif args.model == "gradientboosting":
        if args.classify:
            model = ensemble.GradientBoostingClassifier
        else:
            model = ensemble.GradientBoostingRegressor
    elif args.model == "decisiontree":
        model = DecisionTreeRegressor
    elif args.model == "adaboost":
        model = ensemble.AdaBoostRegressor
    elif args.model == "linreg":
        model = LinearRegression
    elif args.model == "autolearn":
        printd("AutoLearn disabled as it does not work properly")
        sys.exit(-1)
        #model = AutoSklearnClassifier
        fitargs["dataset_name"] = "semeval"
    elif args.model == "nn":
        model = NNModel
        fitargs["nb_epoch"] = 10
        fitargs["batch_size"] = 32
        fitargs["verbose"] = 2
        predictargs["verbose"] = 0
    elif args.model == "None":
        printd("No Model specified, exiting")
        sys.exit(-1)
    else:
        printd("Invalid model %s" % args.model)
        sys.exit(-1)
    if args.classify:
        # Forest Classifiers do not allow non-binary labels, so we do it by sample weight instead
        byweight = issubclass(model, ensemble.forest.ForestClassifier)
        lintrainlabels = np.copy(fs.trainlabels)
        fs.discretizeLabels(byweight=byweight)
        if byweight:
            fitargs["sample_weight"] = fs.trainweights
    else:
        lintrainlabels = np.array(fs.trainlabels)
    fs.freeze()
    printd("Train labels:" + str(fs.trainlabels.shape))
    if (not args.force) and args.modelfile and os.path.exists(args.modelfile):
        if args.model == "nn":
            import keras
            model = keras.models.load_model(args.modelfile)
        else:
            model = joblib.load(args.modelfile)
    else:
        params = default_params[args.model]
        for param_name, param_value in params.items():
            try:
                pval = getattr(args, param_name)
                if pval is not None:
                    params[param_name] = pval
            except AttributeError:
                pass
        if "input_dim" in params:
            # -1 for label
            params["input_dim"] = len(fs.names) - 1
        model = model(**params)
        if args.gridsearch:
            model = GridSearchCV(model, scoring=evalModel, cv=5, error_score=0,
                param_grid=param_grids[args.model], n_jobs=16, pre_dispatch="2*n_jobs", verbose=10)
        #model = Pipeline(steps=[('pca', kpca), ('dtree', dtree)])
        printd("Training")
        model.fit(fs.train, fs.trainlabels, **fitargs)
        #X_kpca = kpca.fit_transform(X)
        #dtree.fit(traindata, trainlabels)
        if args.modelfile:
            try:
                if args.model == "nn":
                    model.save(args.modelfile)
                else:
                    joblib.dump(model, args.modelfile)
            except Exception:
                printd("Could not save model, autolearn does not support saving")
    printd("Evaluating")
    print "Using Features: %s" % args.basefeatures
    print "Using Matchers: %s" % args.matchers
    print "Train Accuracy"
    evalData(model=model, data=fs.train, labels=lintrainlabels, classify=args.classify,
        obs=model.oob_prediction_, **predictargs)  # trainobs = _
    print "Test Accuracy"
    testobs = evalData(model=model, data=fs.test, labels=fs.testlabels, classify=args.classify, **predictargs)
    if args.writematches:
        try:
            fs.data.writer
        except AttributeError:
            fs.data = dataset.Dataset.load(args.input_data, args.dataset)
        trainwriter = fs.data.writer(args.writematches + ".train")
        testwriter = fs.data.writer(args.writematches + ".test")
        for pair in fs.data.train():
            trainwriter.write(pair, Match(score=pair.label, autop=0))
        for pair, obs in izip(fs.data.test(), testobs):
            testwriter.write(pair, Match(score=obs))
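# [Sketch] The if/elif chain in main() maps the --model argument onto an
# estimator class.  A hypothetical table-driven equivalent for the ensemble
# models (names and layout are illustrative, not part of this project's API):
def _pick_model(name, classify):
    from sklearn import ensemble
    table = {
        "randomforests": (ensemble.RandomForestClassifier, ensemble.RandomForestRegressor),
        "extratrees": (ensemble.ExtraTreesClassifier, ensemble.ExtraTreesRegressor),
        "gradientboosting": (ensemble.GradientBoostingClassifier, ensemble.GradientBoostingRegressor),
    }
    clf, reg = table[name]
    return clf if classify else reg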
def addMatcher(self, matcher, namebase=""): names = [namebase + x for x in matcher.names()] self.names.extend(names) for datum, pair in chain(izip(self.train, self.data.train()), izip(self.test, self.data.test())): try: matcher.match(pair) except ValueError, err: printd(err) fs = matcher.features() if len(names) != len(fs): printd("Incorrect names for features for %s: %d vs %d" % (matcher.__class__.__name__, len(names), len(fs)), -1) printd(names, -1) sys.exit(-1) for f in fs: # Checking if np.isnan(f) or np.isinf(f): printd("Bad feature from %s" % matcher.__class__.__name__, -1) printd(f, -1) printd(pair, -1) sys.exit(-1) datum.extend(fs)
def normalize(cls, s, df):
    if len(s) == 0:
        return s, 0
    if np.any(np.isnan(df)):
        printd("Hmm, nan for df %0.4f")
        printd("df:\n" + str(df))
        sys.exit(1)
    # TODO: This should be a weighted sum with IDF.
    # As a result of this sum, different-length sentences naturally receive a
    # penalty, as the sum is naturally larger than the min.
    # Also, we aren't looking at euclidean distance, so we may be losing out on
    # scale information; but if we did, longer sentences would be harder to match
    # together (as distances would compound).
    # Maybe we should divide by the min sentence length or something of the sort.
    # This is an average, not a sum, which probably causes all sorts of weirdness.
    ps = np.sum(s, axis=0) / np.sum(s)
    if np.any(np.isnan(ps)):
        printd("Hmm, nan for ps %0.4f" % np.sum(s))
        printd("ps:\n" + str(ps))
        printd("s:\n" + str(s))
        printd("df:\n" + str(df))
        sys.exit(1)
    ts = np.sum(np.multiply(ps, df))
    if ts == 0:
        printd("Hmm, 0 for ts")
        printd("ps:\n" + str(ps))
        printd("df:\n" + str(df))
        sys.exit(1)
    return ps, ts
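# [Sketch] Toy illustration of the normalization above: collapse a sentence's
# word vectors into a single distribution ps (each dimension's share of the
# total mass) and precompute its weighted sum ts against the log-document-
# frequency vector.  Toy numbers only; real inputs come from the normalized
# word vectors.
def _infsim_normalize_sketch():
    import numpy as np
    s = np.array([[0.2, 0.8], [0.6, 0.4]])  # two word vectors, two dimensions
    df = np.array([1.0, 2.0])                # stand-in log-df weights
    ps = np.sum(s, axis=0) / np.sum(s)       # ps == [0.4, 0.6]
    ts = np.sum(np.multiply(ps, df))         # scalar used later as a normalizer
    return ps, ts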
class MinDistSim(Matcher):
    def __init__(self, df=None, metric='cosine', maxsent=20, ngram=1, recurse=False, dimfeatures=True):
        #self.dist = ndist(s1, s2)
        #s1 = s1["vector"]
        #s2 = s2["vector"]
        self.metric = getMetric(metric)
        self._names = ["MDS_" + x for x in
            ["tsim", "lsim", "kdist", "kldist", "ldist", "kt", "tmax", "tmin", "tsum", "tstd", "tmaxidf", "tsumidf"]]
        maxsent = maxsent - ngram + 1
        if dimfeatures:
            self._names.extend(["MDS_w%03d" % x for x in range(maxsent)])
        self.maxsent = maxsent
        self.ngram = ngram
        self.recurse = recurse
        self.vocab = df
        self.wordcount = df.total
        self.dimfeatures = dimfeatures

    def match(self, pair):
        s1l = len(pair.s1["vector"])
        s2l = len(pair.s2["vector"])
        self.tsim = float('-9999')
        self.lsim = float('-9999')
        self.minlen = min(s1l, s2l)
        self.maxlen = max(s1l, s2l)
        self.nmatches = 0
        self.start = -1
        self.end = -1
        if (self.minlen == 0 or self.maxlen >= 100):
            return self.tsim
        # For simplicity in later code, make the shorter one first
        if s1l < s2l:
            self.s1 = pair.s1
            self.s2 = pair.s2
            s1l = len(pair.s1["vector"])
            s2l = len(pair.s2["vector"])
        else:
            self.s1 = pair.s2
            self.s2 = pair.s1
        wc = self.wordcount
        if "wv_idfs" not in self.s1:
            self.s1["wv_idfs"] = [math.log(wc / self.vocab[x], 2) for x in self.s1["wv_tokens"]]
        if "wv_idfs" not in self.s2:
            self.s2["wv_idfs"] = [math.log(wc / self.vocab[x], 2) for x in self.s2["wv_tokens"]]
        if self.ngram > 1:
            ng = self.ngram
            v1 = self.s1["vector"]
            v2 = self.s2["vector"]
            t1 = self.s1["wv_tokens"]
            t2 = self.s2["wv_tokens"]
            #idf1 = self.s1["wv_idfs"]
            #idf2 = self.s2["wv_idfs"]
            weights1 = self.s1["weights"]
            weights2 = self.s2["weights"]
            nv1 = [sum(v1[i:i + ng]) for i in range(max(1, len(v1) - ng + 1))]
            nv2 = [sum(v2[i:i + ng]) for i in range(max(1, len(v2) - ng + 1))]
            nt1 = ["_".join(t1[i:i + ng]) for i in range(max(1, len(t1) - ng + 1))]
            nt2 = ["_".join(t2[i:i + ng]) for i in range(max(1, len(t2) - ng + 1))]
            #nidf1 = [max(idf1[i:i + ng]) for i in range(max(1, len(idf1) - ng + 1))]
            #nidf2 = [max(idf2[i:i + ng]) for i in range(max(1, len(idf2) - ng + 1))]
            nweights1 = [max(weights1[i:i + ng]) for i in range(max(1, len(weights1) - ng + 1))]
            nweights2 = [max(weights2[i:i + ng]) for i in range(max(1, len(weights2) - ng + 1))]
            #self.s1 = {"vector": nv1, "wv_tokens": nt1, "wv_idfs": nidf1}
            #self.s2 = {"vector": nv2, "wv_tokens": nt2, "wv_idfs": nidf2}
            self.s1 = {"vector": nv1, "wv_tokens": nt1, "weights": nweights1}
            self.s2 = {"vector": nv2, "wv_tokens": nt2, "weights": nweights2}
            self.minlen = max(self.minlen - ng + 1, 1)
            self.maxlen = max(self.maxlen - ng + 1, 1)
        self.dists = [1] * self.minlen
        self.pair = pair
        #self.dist = pairdist(self.s1["vector"], self.s2["vector"], fn=self.metric)
        #self.dist = pairdist(self.s1, self.s2, fn=self.metric)
        dist = self.metric(self.s1, self.s2)
        # scale by max of idf
        #for i in range(dist.shape[0]):
        #    for j in range(dist.shape[1]):
        #        dist[i][j] *= max(self.s1["wv_idfs"][i], self.s2["wv_idfs"][j])
        self.matchv = np.zeros(dist.shape, int)
        np.fill_diagonal(self.matchv, 1)
        if np.sum(dist) == 0:
            self.tsim = 1
            self.nmatches = min(dist.shape)
            self.start = 0
            self.end = dist.shape[1] - 1
            return self.tsim
        if (dist == dist[0]).all():
            self.tsim = 1 - sum(dist[0])
            self.nmatches = min(dist.shape)
            self.start = 0
            self.end = dist.shape[1] - 1
            return self.tsim
        if (dist.T == dist[:, 0]).all():
            self.tsim = 1 - sum(dist[:, 0])
            self.nmatches = min(dist.shape)
            self.start = 0
            self.end = dist.shape[1] - 1
            return self.tsim
        signal.signal(signal.SIGALRM, munkres_handler)
        signal.alarm(10)
        try:
            matches = munkres(dist)
        except Exception, e:
            printd(e)
            printd("dist: " + str(dist.shape))
            printd(dist)
            return self.tsim
        signal.alarm(0)
        self.matchv = matches
        tdist = 0
        tmaxidf = 0
        tsumidf = 0
        nmatches = 0
        mstart = dist.shape[1]
        mend = 0
        #print self.s1["text"]
        #print self.s2["text"]
        #print " ".join(self.s1["wv_tokens"])
        #print " ".join(self.s2["wv_tokens"])
        s1tok = self.s1["wv_tokens"]
        s2tok = self.s2["wv_tokens"]
        matcharr = [0] * matches.shape[0]
        dists = [0] * matches.shape[0]
        matchedy = [0] * matches.shape[1]
        for i in range(matches.shape[0]):
            for j in range(matches.shape[1]):
                if matches[i, j]:
                    matchedy[j] = 1
                    tdist += dist[i, j]
                    #tmaxidf += dist[i, j] * max(self.s1["wv_idfs"][i], self.s2["wv_idfs"][j])
                    #tsumidf += dist[i, j] * sum((self.s1["wv_idfs"][i], self.s2["wv_idfs"][j]))
                    wi = self.s1["weights"][i]
                    wj = self.s2["weights"][j]
                    tmaxidf += dist[i, j] * max(wi, wj)
                    tsumidf += dist[i, j] * sum((wi, wj))
                    printd("%s\t%s\t%0.4f\t%0.4f\t%0.4f" % (s1tok[i], s2tok[j], dist[i, j], wi, wj), level=1, sock=sys.stdout)
                    nmatches += 1
                    matcharr[i] = j
                    dists[i] = dist[i, j]
                    if j < mstart:
                        mstart = j
                    if j > mend:
                        mend = j
        ldist = tdist
        tdist = tdist * max(dist.shape) / pow(min(dist.shape), 2)
        tmaxidf = tmaxidf * max(dist.shape) / pow(min(dist.shape), 2)
        tsumidf = tsumidf * max(dist.shape) / pow(min(dist.shape), 2)
        kt, ktp = kendalltau(range(len(matcharr)), matcharr)
        printd("Score: %0.4f\t%0.4f\t%0.4f\tLabel: %g\n" % (tdist, tmaxidf, tsumidf, pair.label), level=1, sock=sys.stdout)
        if self.recurse:
            # Remove matches from dist array, and rerun munkres
            # Repeat until dist array is empty
            pass
        else:
            for i in range(matches.shape[1]):
                if not matchedy[i]:
                    ldist += min(matches[:, i])
            ldist /= max(dist.shape)
        # TODO:
        # Dist penalty is at most beta.
        # The problem with this is that there may be a better pairing between the two sentences
        # if you optimize for mindist with dist penalty.
        # Also could add a weight to each pairing like IDF, most important for the
        # summing, but a different sum would again potentially affect the optimal match.
        beta = 1
        self.kdist = tdist * (1 + beta * (kt + 1) / 2)
        self.kldist = ldist * (1 + beta * (kt + 1) / 2)
        self.ldist = ldist
        #print "Score: %g" % tsim
        #print "Label: %g" % self.pair.label
        self.tsim = 1 - tdist
        self.tmaxidf = tmaxidf
        self.tsumidf = tsumidf
        self.nmatches = nmatches
        self.start = mstart
        self.end = mend
        self.kt = kt
        self.dists = sorted(dists, reverse=True)
        self.lsim = tdist + (max(dists) * (self.maxlen - self.minlen))
        self.tmax = max(dists)
        self.tmin = min(dists)
        self.tsum = sum(dists)
        self.tstd = np.std(dists)
        return self.tsim