def processData(args):
    """Load a dataset, vectorize it with word vectors, and attach matchers.

    Reads from ``args``: input_data, dataset, wvfile, wvsource, basefeatures,
    matchers, dimfeatures.  ``args.matchers`` / ``args.basefeatures`` are
    comma-separated lists; empty entries are ignored.

    Returns the populated FeatureSet.
    """
    data = dataset.Dataset.load(args.input_data, args.dataset)
    # Prefer the cached/optimized vector file if it already exists; otherwise
    # read the full source vectors (and write the optimized subset below).
    wvout = args.wvfile
    if os.path.exists(wvout):
        wordvecf = wvout
    else:
        wordvecf = args.wvsource
    features = {x for x in args.basefeatures.split(',') if x != ''}
    matchers = {x for x in args.matchers.split(',') if x != ''}
    printd("Loading Word Vectors")
    wordvec = WordVec(wordvecf)
    printd("Vectorizing")
    data.vectorize(wordvec)
    maxwords = data.maxShortSentence()
    if wvout != wordvecf:
        # Only reached when wvout did not exist yet: rebuild the word vectors
        # restricted to this dataset's sentences and persist them to wvout.
        printd("Rereading word vectors to optimize...")
        wv_toks = data.wv_sentences()
        wordvec = WordVec(wordvecf, sentences=wv_toks, wvout=wvout,
                          size=wordvec.originalsize)
        data.vectorize(wordvec)
    conf.wvsize = wordvec.size

    # Train data
    printd("Computing basic WV Features")
    fs = FeatureSet(data, features)

    if "Pair" in matchers:
        printd("Computing Pair Features")
        matcher = vectorsim.PairFeatures(dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher)

    if "Shingle" in matchers:
        printd("Computing Shingle Features")
        matcher = Shingler(slop=12, lmbda=0.95)
        fs.addMatcher(matcher)

    # vocab is computed lazily: the first matcher that needs document
    # frequencies fills it in, later matchers reuse it.
    vocab = None
    if "MinDistSim" in matchers:
        printd("Computing MinDist")
        vocab = fs.data.wv_vocab()
        data.weight()
        comparator = 'cosine'
        matcher = vectorsim.MinDistSim(metric=comparator, df=vocab,
                                       maxsent=maxwords,
                                       dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher, 'cos')
        printd("Computing MinDist-Euclidean")
        comparator = 'euclidean'
        matcher = vectorsim.MinDistSim(metric=comparator, df=vocab,
                                       maxsent=maxwords,
                                       dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher, 'euc')

    if "NGram" in matchers:
        printd("Computing MinDist-Ngram")
        # Reuse the vocab from the MinDistSim branch when available; the
        # original recomputed it unconditionally and left this guard dead.
        if vocab is None:
            vocab = fs.data.wv_vocab()
        comparator = 'cosine'
        matcher = vectorsim.MinDistSim(metric=comparator, df=vocab,
                                       maxsent=maxwords, ngram=2,
                                       dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher, 'cos-bigram')
        comparator = 'cosine'
        matcher = vectorsim.MinDistSim(metric=comparator, df=vocab,
                                       maxsent=maxwords, ngram=3,
                                       dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher, 'cos-trigram')

    if "WWSim" in matchers:
        printd("Computing WWSim")
        matcher = vectorsim.WWSim(wordvec=wordvec, dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher)

    if "InfRankSim" in matchers:
        printd("Computing InfRankSim")
        matcher = vectorsim.InfRankSim(data=data, wordvec=wordvec,
                                       dimfeatures=args.dimfeatures)
        printd("InfRankSim Matching")
        fs.addMatcher(matcher)

    if "InfSim" in matchers:
        # We normalize after so primary features are raw word vectors
        # InfSim
        printd("Computing InfSim")
        wordvec.normalize()
        data.vectorize(wordvec)
        matcher = vectorsim.InfSim(data=data, wordvec=wordvec,
                                   dimfeatures=args.dimfeatures)
        fs.addMatcher(matcher)
    return fs
def main(args):
    """Command-line driver: load data, (re)build word vectors, and write
    shingle and/or vector-similarity matches for every test pair.

    Mutates the module-level ``wordvec``/``wordvecf`` globals and ``conf``.
    NOTE: Python 2 code (``print >>``, ``except E, e``, ``unicode``,
    ``itervalues``).
    """
    global wordvec, wordvecf
    conf.debug = args.debug or args.verbose
    conf.verbose = args.verbose
    conf.args = args
    #nf = args.nuggets
    #uf = args.updates
    #mf = args.matches
    sf = args.shingles
    vf = args.wordvec
    #ef = args.evalfile
    wvout = args.wvfile
    sim_thr = args.sim_thr
    dset = args.dataset
    limit = args.limit
    #if args.dataset == "auto":
    #    if ef is not None:
    #        dset = "semeval"
    #    else:
    #        with open(glob.glob(nf)[0]) as nh:
    #            nfhead = nh.readline()
    #            if nfhead.startswith("query_id\tnugget_id"):
    #                dset = "ts"
    #            elif nfhead.startswith("query_id\tvs_id"):
    #                dset = "mclick"
    #            else:
    #                dset = "1click"
    # Reuse the previously optimized vector file unless --force was given.
    if os.path.exists(wvout) and not args.force:
        wordvecf = wvout
    if vf:
        printd("Reading word vector...")
        #wordvec = load_wordvec()
        wordvec = WordVec(wordvecf)
        # Select the matcher *class* here; it is instantiated further below
        # once logdf is available.
        if args.sim == "minsim":
            matcher = MinDistSim
        elif args.sim == "infsim":
            matcher = InfSim
        else:
            matcher = VecSim
        # InfSim-style similarity works on normalized vectors.
        if args.sim == "infsim" or args.comparator == "infsim":
            wordvec.normalize()
    #if dset == "ts":
    #    nuggfn = Nuggets
    #    updfn = Updates
    #    outfn = MatchWriter
    #elif dset == "1click":
    #    nuggfn = CLNuggets
    #    updfn = CLUpdates
    #    outfn = CLMatchWriter
    #elif dset == "mclick":
    #    nuggfn = MCNuggets
    #    updfn = Updates
    #    outfn = MCMatchWriter
    #elif dset == "semeval":
    #    data = SemEvalDataset(args.input_data, args.evalfile)
    #    outfn = data.writer
    #    if vf is not None:
    #        data.vectorize(wordvec)
    #else:
    #    nuggfn = MCNuggets
    #    updfn = Updates
    #    outfn = MCMatchWriter
    data = Dataset.load(args.input_data, dset)
    if vf is not None:
        data.vectorize(wordvec)
    #if dset == "semeval":
    #    data = SemEvalDataset(args.input_data, args.evalfile)
    #    #outfn = data.writer
    #    if vf is not None:
    #        data.vectorize(wordvec)
    #else:
    #    printd("Processing Nuggets...")
    #    #nuggets = nuggfn(nf, vectorize=vf is not None)
    #    printd("Processing Updates...")
    #    #updates = updfn(uf, vectorize=vf is not None)
    #    #data = NuggetDataset(nuggets, updates, mf)
    #    data = NuggetDataset(nf, uf, mf, dset=dset, vectorize=vf is not None)
    if vf and wvout is not None and wvout != wordvecf:
        # Rebuild the vectors restricted to this dataset's sentences and save
        # them (plus .vocab / .toks sidecar files) so later runs start faster.
        printd("Rereading word vectors to optimize...")
        wv_toks = data.wv_sentences()
        #if dset == "semeval":
        #    wv_toks = data.wv_sentences()
        #else:
        #    wv_toks = nuggets.wv_text() + updates.wv_text()
        wordvec = WordVec(wordvecf, sentences=wv_toks, wvout=wvout, size=wordvec.originalsize)
        if args.sim == "infsim" or args.comparator == "infsim":
            wordvec.normalize()
        data.vectorize(wordvec)
        with open(wvout + ".vocab", 'w') as wh:
            wh.write("\n".join(wordvec.vocab().keys()))
        with open(wvout + ".toks", 'w') as wh:
            wh.write("\n".join([" ".join(x) for x in wv_toks]))
        #vocab = nuggets.wv_vocab().union(updates.wv_vocab())
        #wordvec.trim(lambda word, count, min_count: gensim.utils.RULE_KEEP if word in vocab else gensim.utils.RULE_DISCARD)
        #wordvec.save(wvout)
    # Frequencies: load an external JSON vocabulary if provided, otherwise
    # fall back to the dataset's own vocabulary counts.
    vocab = None
    if args.frequencies:
        try:
            with open(args.frequencies) as fh:
                vocab = json.load(fh)
                # For Term Frequencies instead of Document Frequencies
                # Could also do len(vocab[word]) if wanted to mimic DF
                if type(vocab.itervalues().next()) == dict:
                    for word in vocab:
                        vocab[word] = sum(vocab[word].itervalues())
        except Exception:
            # NOTE(review): deliberately best-effort — any parse/IO failure
            # silently falls back to the dataset vocabulary below.
            pass
    if vocab is None:
        vocab = data.wv_vocab()
    logdf = wordvec.logdf(vocab)
    logdffile = wordvecf + ".logdf"
    #if not os.path.exists(logdffile) or (os.path.getmtime(logdffile) < os.path.getmtime(wordvecf)):
    #    np.savetxt(logdffile, logdf, delimiter=" ", fmt="%g")
    np.savetxt(logdffile, logdf, delimiter=" ", fmt="%g")
    if args.comparator == "infsim" and args.sim != "infsim":
        comparator = InfSim(logdf).pairwisedist
    else:
        comparator = args.comparator
    # Instantiate the matcher class chosen earlier.
    matcher = matcher(df=logdf, metric=comparator)
    data.normalize(matcher, logdf)
    printd("Finding matches...")
    matches = []
    with data.writer(sf) as sw, data.writer(vf) as vw:
        mcnt = 0
        timer = Timer()
        for pair in data.test():
            if sf:
                match = shingle(pair.s1["tokens"], pair.s2["tokens"])
                # NOTE(review): min_score is not defined in this function —
                # presumably a module-level constant; confirm, otherwise this
                # raises NameError when shingles are enabled.
                if match.score >= min_score:
                    sw.write(pair, match)
            if vf:
                printd("Matching pair %s" % (pair.pid), level=1)
                try:
                    sim = matcher.match(pair)
                    matches.append((matcher.tsim, unicode(matcher)))
                except ValueError, err:
                    printd(err)
                    sim = sim_thr
                printd("Match %0.4f for %s, %s" % (sim, pair.sid1, pair.sid2))
                # sim appears to be a distance: below-threshold values keep
                # the matcher's span, otherwise the whole sentence is used.
                if sim < sim_thr:
                    sim = sim_thr
                    start = matcher.start
                    end = matcher.end - matcher.start
                else:
                    start = -1
                    end = len(pair.s2["tokens"]) - 1
                match = Match(sim, start, end)
                vw.write(pair, match)
            mcnt += 1
            if (mcnt % 100000) == 0:
                # Progress: pairs matched per unit time (tmps).
                print >> sys.stderr, "%g tmps" % (100 / timer.mark())
            if limit and mcnt >= limit:
                return
    if conf.verbose:
        for tsim, match in sorted(matches):
            print match
def main(args):
    """Command-line driver: load data, (re)build word vectors, and write
    shingle and/or vector-similarity matches for every test pair.

    NOTE(review): this is a second, near-identical definition of ``main``
    (only the ``print >>sys.stderr`` spacing differs from the earlier one);
    at import time it silently replaces the first definition — confirm which
    copy is intended and remove the other.

    Mutates the module-level ``wordvec``/``wordvecf`` globals and ``conf``.
    NOTE: Python 2 code (``print >>``, ``except E, e``, ``unicode``,
    ``itervalues``).
    """
    global wordvec, wordvecf
    conf.debug = args.debug or args.verbose
    conf.verbose = args.verbose
    conf.args = args
    #nf = args.nuggets
    #uf = args.updates
    #mf = args.matches
    sf = args.shingles
    vf = args.wordvec
    #ef = args.evalfile
    wvout = args.wvfile
    sim_thr = args.sim_thr
    dset = args.dataset
    limit = args.limit
    #if args.dataset == "auto":
    #    if ef is not None:
    #        dset = "semeval"
    #    else:
    #        with open(glob.glob(nf)[0]) as nh:
    #            nfhead = nh.readline()
    #            if nfhead.startswith("query_id\tnugget_id"):
    #                dset = "ts"
    #            elif nfhead.startswith("query_id\tvs_id"):
    #                dset = "mclick"
    #            else:
    #                dset = "1click"
    # Reuse the previously optimized vector file unless --force was given.
    if os.path.exists(wvout) and not args.force:
        wordvecf = wvout
    if vf:
        printd("Reading word vector...")
        #wordvec = load_wordvec()
        wordvec = WordVec(wordvecf)
        # Select the matcher *class* here; it is instantiated further below
        # once logdf is available.
        if args.sim == "minsim":
            matcher = MinDistSim
        elif args.sim == "infsim":
            matcher = InfSim
        else:
            matcher = VecSim
        # InfSim-style similarity works on normalized vectors.
        if args.sim == "infsim" or args.comparator == "infsim":
            wordvec.normalize()
    #if dset == "ts":
    #    nuggfn = Nuggets
    #    updfn = Updates
    #    outfn = MatchWriter
    #elif dset == "1click":
    #    nuggfn = CLNuggets
    #    updfn = CLUpdates
    #    outfn = CLMatchWriter
    #elif dset == "mclick":
    #    nuggfn = MCNuggets
    #    updfn = Updates
    #    outfn = MCMatchWriter
    #elif dset == "semeval":
    #    data = SemEvalDataset(args.input_data, args.evalfile)
    #    outfn = data.writer
    #    if vf is not None:
    #        data.vectorize(wordvec)
    #else:
    #    nuggfn = MCNuggets
    #    updfn = Updates
    #    outfn = MCMatchWriter
    data = Dataset.load(args.input_data, dset)
    if vf is not None:
        data.vectorize(wordvec)
    #if dset == "semeval":
    #    data = SemEvalDataset(args.input_data, args.evalfile)
    #    #outfn = data.writer
    #    if vf is not None:
    #        data.vectorize(wordvec)
    #else:
    #    printd("Processing Nuggets...")
    #    #nuggets = nuggfn(nf, vectorize=vf is not None)
    #    printd("Processing Updates...")
    #    #updates = updfn(uf, vectorize=vf is not None)
    #    #data = NuggetDataset(nuggets, updates, mf)
    #    data = NuggetDataset(nf, uf, mf, dset=dset, vectorize=vf is not None)
    if vf and wvout is not None and wvout != wordvecf:
        # Rebuild the vectors restricted to this dataset's sentences and save
        # them (plus .vocab / .toks sidecar files) so later runs start faster.
        printd("Rereading word vectors to optimize...")
        wv_toks = data.wv_sentences()
        #if dset == "semeval":
        #    wv_toks = data.wv_sentences()
        #else:
        #    wv_toks = nuggets.wv_text() + updates.wv_text()
        wordvec = WordVec(wordvecf, sentences=wv_toks, wvout=wvout, size=wordvec.originalsize)
        if args.sim == "infsim" or args.comparator == "infsim":
            wordvec.normalize()
        data.vectorize(wordvec)
        with open(wvout + ".vocab", 'w') as wh:
            wh.write("\n".join(wordvec.vocab().keys()))
        with open(wvout + ".toks", 'w') as wh:
            wh.write("\n".join([" ".join(x) for x in wv_toks]))
        #vocab = nuggets.wv_vocab().union(updates.wv_vocab())
        #wordvec.trim(lambda word, count, min_count: gensim.utils.RULE_KEEP if word in vocab else gensim.utils.RULE_DISCARD)
        #wordvec.save(wvout)
    # Frequencies: load an external JSON vocabulary if provided, otherwise
    # fall back to the dataset's own vocabulary counts.
    vocab = None
    if args.frequencies:
        try:
            with open(args.frequencies) as fh:
                vocab = json.load(fh)
                # For Term Frequencies instead of Document Frequencies
                # Could also do len(vocab[word]) if wanted to mimic DF
                if type(vocab.itervalues().next()) == dict:
                    for word in vocab:
                        vocab[word] = sum(vocab[word].itervalues())
        except Exception:
            # NOTE(review): deliberately best-effort — any parse/IO failure
            # silently falls back to the dataset vocabulary below.
            pass
    if vocab is None:
        vocab = data.wv_vocab()
    logdf = wordvec.logdf(vocab)
    logdffile = wordvecf + ".logdf"
    #if not os.path.exists(logdffile) or (os.path.getmtime(logdffile) < os.path.getmtime(wordvecf)):
    #    np.savetxt(logdffile, logdf, delimiter=" ", fmt="%g")
    np.savetxt(logdffile, logdf, delimiter=" ", fmt="%g")
    if args.comparator == "infsim" and args.sim != "infsim":
        comparator = InfSim(logdf).pairwisedist
    else:
        comparator = args.comparator
    # Instantiate the matcher class chosen earlier.
    matcher = matcher(df=logdf, metric=comparator)
    data.normalize(matcher, logdf)
    printd("Finding matches...")
    matches = []
    with data.writer(sf) as sw, data.writer(vf) as vw:
        mcnt = 0
        timer = Timer()
        for pair in data.test():
            if sf:
                match = shingle(pair.s1["tokens"], pair.s2["tokens"])
                # NOTE(review): min_score is not defined in this function —
                # presumably a module-level constant; confirm, otherwise this
                # raises NameError when shingles are enabled.
                if match.score >= min_score:
                    sw.write(pair, match)
            if vf:
                printd("Matching pair %s" % (pair.pid), level=1)
                try:
                    sim = matcher.match(pair)
                    matches.append((matcher.tsim, unicode(matcher)))
                except ValueError, err:
                    printd(err)
                    sim = sim_thr
                printd("Match %0.4f for %s, %s" % (sim, pair.sid1, pair.sid2))
                # sim appears to be a distance: below-threshold values keep
                # the matcher's span, otherwise the whole sentence is used.
                if sim < sim_thr:
                    sim = sim_thr
                    start = matcher.start
                    end = matcher.end - matcher.start
                else:
                    start = -1
                    end = len(pair.s2["tokens"]) - 1
                match = Match(sim, start, end)
                vw.write(pair, match)
            mcnt += 1
            if (mcnt % 100000) == 0:
                # Progress: pairs matched per unit time (tmps).
                print >>sys.stderr, "%g tmps" % (100 / timer.mark())
            if limit and mcnt >= limit:
                return
    if conf.verbose:
        for tsim, match in sorted(matches):
            print match