def build(self, p):
    """Fill ``self.indexdict`` from the tab-separated file at path ``p``.

    Every line must provide at least six tab-separated columns, read as:
    an id (column 0), two integers that are summed into ``triplecount``
    (columns 1 and 2), a name (column 3), a type id (column 4) and a type
    name (column 5).  The name is normalised with
    ``self.processor.processline`` and used as key of ``self.indexdict``;
    each key maps to a list holding one dict per matching line (keys
    ``fb_id``, ``triplecount``, ``type_id``, ``type_name``).

    A type id equal to "<UNK>", and a type name whose tokenized form equals
    the tokenized "<UNK>", are stored as ``None``.

    :param p: path of the tab-separated input file
    """
    tt = ticktock("builder")
    tt.tick("building")
    # The tokenized placeholder never changes, so compute it once.
    unk_type_name = " ".join(tokenize("<UNK>"))
    # The context manager closes the file even if a malformed line raises.
    with open(p) as infile:
        for i, line in enumerate(infile, 1):
            # Strip only the newline: a final line without a trailing
            # newline must not lose its last character.
            sline = line.rstrip("\n").split("\t")
            fb_id = sline[0]
            triplecount = int(sline[1]) + int(sline[2])
            name = self.processor.processline(sline[3])
            type_id = sline[4] if sline[4] != "<UNK>" else None
            type_name = " ".join(tokenize(sline[5]))
            if type_name == unk_type_name:
                type_name = None
            if name not in self.indexdict:
                self.indexdict[name] = []
            self.indexdict[name].append({
                "fb_id": fb_id,
                "triplecount": triplecount,
                "type_id": type_id,
                "type_name": type_name,
            })
            # Progress report every 1000 lines.
            if i % 1000 == 0:
                tt.live("{}k".format(i // 1000))
    tt.tock("built")
def getdata(p, worddic, chardic, entdic, reldic, maxc=np.inf, maxchar=30):
    """Encode a question/answer TSV file at word and character level.

    Every line of the file at ``p`` must look like
    ``question<TAB>subject predicate``.  ``worddic``, ``entdic``,
    ``reldic`` and ``chardic`` are extended in place with every word,
    subject, predicate and character that was not known before.

    :param p: path of the TSV file
    :param worddic: word -> id mapping, updated in place
    :param chardic: character -> id mapping, updated in place
    :param entdic: subject -> id mapping, updated in place
    :param reldic: predicate -> id mapping, updated in place
    :param maxc: reading stops as soon as more than ``maxc`` lines were read
    :param maxchar: upper bound on the characters stored per word; lowered
        to the longest word actually seen
    :return: ``(datamat, goldmat)``.  ``datamat`` has shape
        (lines, longest question in words, 1 + maxchar): slot 0 of the last
        axis is the word id, the remaining slots are character ids, -1 is
        padding.  ``goldmat`` is an int32 (lines, 2) matrix of
        [subject id, predicate id].
    """
    data = []
    gold = []
    maxlen = 0
    maxwordlen = 0
    c = 0
    # The context manager closes the file even when a line is malformed.
    with open(p) as infile:
        for line in infile:
            q, a = line.rstrip("\n").split("\t")
            # Distinct names: the original rebound the path parameter here.
            subj, pred = a.split()
            words = tokenize(q)
            maxlen = max(maxlen, len(words))
            for word in words:
                maxwordlen = max(maxwordlen, len(word))
                if word not in worddic:
                    worddic[word] = len(worddic)
            if subj not in entdic:
                entdic[subj] = len(entdic)
            if pred not in reldic:
                reldic[pred] = len(reldic)
            data.append(words)
            gold.append([entdic[subj], reldic[pred]])
            c += 1
            if c > maxc:
                break
    print(maxwordlen)
    maxchar = min(maxchar, maxwordlen)
    wordmat = np.zeros((c, maxlen)).astype("int32") - 1
    charten = np.zeros((c, maxlen, maxchar)).astype("int32") - 1
    goldmat = np.zeros((c, 2)).astype("int32")
    for i, sent in enumerate(data):
        for j, word in enumerate(sent):
            if len(word) > maxchar:
                # Report words whose tail is cut off.
                print(word)
            wordmat[i, j] = worddic[word]
            codes = [ord(ch) for ch in word[:maxchar]]
            charten[i, j, :len(codes)] = codes
    for i, pair in enumerate(gold):
        goldmat[i, :] = pair
    # Build a code point -> id view of chardic, give every unseen code point
    # the smallest id that is still free, then write the result back.
    thischardic = {ord(ch): idx for ch, idx in chardic.items()}
    usedids = set(thischardic.values())
    nextid = 0
    while nextid in usedids:
        nextid += 1
    for uniquechar in np.unique(charten):
        # Negative entries are padding, not characters.
        if uniquechar >= 0 and uniquechar not in thischardic:
            thischardic[int(uniquechar)] = nextid
            usedids.add(nextid)
            while nextid in usedids:
                nextid += 1
    chardic.update({chr(code): idx for code, idx in thischardic.items()})
    print("{} {}".format(len(chardic), chardic))
    charten = np.vectorize(lambda x: thischardic[x] if x >= 0 else x)(charten)
    datamat = np.concatenate([wordmat.reshape(wordmat.shape + (1,)), charten], axis=2)
    return datamat, goldmat
def getdata(p, maxc=np.inf):
    """Encode a question/answer TSV file as word-index matrices.

    Every line of the file at ``p`` must look like
    ``question<TAB>subject predicate``.  The function relies on names
    provided by the surrounding scope: ``addwords``, ``worddic``,
    ``entdic``, ``reldic`` and ``add_entity_wo_label``.

    :param p: path of the TSV file
    :param maxc: reading stops as soon as more than ``maxc`` lines were read
    :return: ``(datamat, goldmat)``; ``datamat`` is int32 with one row of
        word ids per question, padded with -1; ``goldmat`` is int32 with
        one [subject id, predicate id] row per question
    :raises Exception: when a predicate is missing from ``reldic``
    """
    data = []
    gold = []
    maxlen = 0
    c = 0
    # The context manager closes the file even when a line is malformed.
    with open(p) as infile:
        for line in infile:
            # Strip only the newline so a final line without one stays intact.
            question, answer = line.rstrip("\n").split("\t")
            subject, predicate = answer.split(" ")
            question_words = tokenize(question)
            addwords(*question_words)
            maxlen = max(maxlen, len(question_words))
            if subject not in entdic:
                add_entity_wo_label(subject)
            if predicate not in reldic:
                raise Exception("predicate should be there")
            wordidx = [worddic[x] if x in worddic else worddic["<RARE>"]
                       for x in question_words]
            data.append(wordidx)
            gold.append([entdic[subject], reldic[predicate]])
            c += 1
            if c % 100 == 0:
                print(c)
            if c > maxc:
                break
    datamat = np.zeros((c, maxlen)).astype("int32") - 1
    goldmat = np.zeros((c, 2)).astype("int32")
    for i, wordidx in enumerate(data):
        datamat[i, :len(wordidx)] = wordidx
    for i, pair in enumerate(gold):
        goldmat[i, :] = pair
    return datamat, goldmat
def getdata(p, maxc=np.inf):
    """Encode a question/answer TSV file as character-index matrices.

    Every line of the file at ``p`` must look like
    ``question<TAB>subject predicate``.  The function relies on names
    provided by the surrounding scope: ``updatechardic``, ``chardic``,
    ``entdic``, ``reldic`` and ``updateentk``.

    :param p: path of the TSV file
    :param maxc: reading stops as soon as more than ``maxc`` lines were read
    :return: ``(datamat, goldmat)``; ``datamat`` is int32 with one row of
        character ids per question (tokens re-joined with single spaces),
        padded with -1; ``goldmat`` is int32 with one
        [subject id, predicate id] row per question
    :raises Exception: when a predicate is missing from ``reldic``
    """
    data = []
    gold = []
    maxlen = 0
    c = 0
    # The context manager closes the file even when a line is malformed.
    with open(p) as infile:
        for line in infile:
            q, a = line.rstrip("\n").split("\t")
            # Distinct names: the original rebound the path parameter here.
            subj, pred = a.split()
            chars = " ".join(tokenize(q))
            updatechardic(*set(chars))
            maxlen = max(maxlen, len(chars))
            if subj not in entdic:
                updateentk(subj)
            if pred not in reldic:
                raise Exception("impossibru!")
            # A real list (not a lazy map object) is needed for len() below.
            charidx = [chardic[ch] if ch in chardic else chardic[" "]
                       for ch in chars]
            data.append(charidx)
            gold.append([entdic[subj], reldic[pred]])
            c += 1
            if c % 100 == 0:
                print(c)
            if c > maxc:
                break
    datamat = np.zeros((c, maxlen)).astype("int32") - 1
    goldmat = np.zeros((c, 2)).astype("int32")
    for i, charidx in enumerate(data):
        datamat[i, :len(charidx)] = charidx
    for i, pair in enumerate(gold):
        goldmat[i, :] = pair
    return datamat, goldmat
def stemmedprocessline(self, x):
    """Return ``x`` with apostrophes removed, tokenized and stemmed.

    The tokens are stemmed with ``self.stemmer.stem`` and re-joined with
    single spaces.
    """
    # Order matters: the possessive forms must go before the bare apostrophe.
    for fragment in ("'s", "' s", "'"):
        x = x.replace(fragment, "")
    stemmed = []
    for token in tokenize(x):
        stemmed.append(self.stemmer.stem(token))
    return " ".join(stemmed)
def run(labelp="labels.map", datap="datamat.word.pkl"):
    """Print label-coverage statistics for a pickled dataset.

    Loads the entity labels through ``loadlabels(labelp)`` and the pickled
    dictionary at ``datap`` (keys used: "train", "valid", "test",
    "entdic", "numents"), then prints: whether "the" is in the Glove
    vocabulary, the number of training examples, the first entities and the
    entities that have no label, how many test/validation entities never
    occur in training, and how many label words are missing from Glove.

    :param labelp: path handed to ``loadlabels``
    :param datap: path of the pickled dataset
    """
    labeldic = loadlabels(labelp)
    glove = Glove(50)
    print("the" in glove)
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The context manager closes the handle the original left open.
    with open(datap) as infile:
        x = pickle.load(infile)
    print(len(x["train"][0]))

    # get entities without labels
    allentids = sorted(x["entdic"].items(), key=lambda item: item[1])[:x["numents"]]
    print(allentids[:20])
    entidsnotinlabeldic = set(entid for entid, _ in allentids).difference(
        set(labeldic.keys()))
    print("{} {}".format(len(entidsnotinlabeldic), list(entidsnotinlabeldic)[:20]))
    ed = {v: k for k, v in x["entdic"].items()}

    alltrainents = set(x["train"][1][:, 0])
    allvalidents = set(x["valid"][1][:, 0])
    alltestents = set(x["test"][1][:, 0])

    def report(setname, ents):
        # Share of the entities of one split that training never shows.
        unseen = ents.difference(alltrainents)
        print("%d/%d (%.2f%%) %s set entities not in training " % (
            len(unseen), len(ents), len(unseen) * 100. / len(ents), setname))

    report("test", alltestents)
    report("validation", allvalidents)

    # gather all words in entity labels
    labelwords = {}
    labelwordsnotinglove = set()
    for label in labeldic.values():
        for labelw in tokenize(label):
            labelwords[labelw] = labelwords.get(labelw, 0) + 1
            if labelw not in glove:
                labelwordsnotinglove.add(labelw)
    print("%d unique words in labels" % len(labelwords))
    print("%d words not in glove" % len(labelwordsnotinglove))

    for splitname in ["train", "test", "valid"]:
        questions, golds = x[splitname][0], x[splitname][1]
        # NOTE(review): the number of examples whose subject has no label
        # is computed but never reported, exactly as in the original.
        wocount = sum(1 for i in range(len(questions))
                      if ed[golds[i][0]] in entidsnotinlabeldic)
def run(labelp="labels.map", datap="datamat.word.pkl"):
    """Print label-coverage statistics for a pickled dataset.

    Loads the entity labels through ``loadlabels(labelp)`` and the pickled
    dictionary at ``datap`` (keys used: "train", "valid", "test",
    "entdic", "numents"), then prints: whether "the" is in the Glove
    vocabulary, the number of training examples, the first entities and the
    entities that have no label, how many test/validation entities never
    occur in training, and how many label words are missing from Glove.

    :param labelp: path handed to ``loadlabels``
    :param datap: path of the pickled dataset
    """
    labeldic = loadlabels(labelp)
    glove = Glove(50)
    print("the" in glove)
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The context manager closes the handle the original left open.
    with open(datap) as infile:
        x = pickle.load(infile)
    print(len(x["train"][0]))

    # get entities without labels
    allentids = sorted(x["entdic"].items(), key=lambda item: item[1])[:x["numents"]]
    print(allentids[:20])
    entidsnotinlabeldic = set(entid for entid, _ in allentids).difference(
        set(labeldic.keys()))
    print("{} {}".format(len(entidsnotinlabeldic), list(entidsnotinlabeldic)[:20]))
    ed = {v: k for k, v in x["entdic"].items()}

    alltrainents = set(x["train"][1][:, 0])
    allvalidents = set(x["valid"][1][:, 0])
    alltestents = set(x["test"][1][:, 0])

    def report(setname, ents):
        # Share of the entities of one split that training never shows.
        unseen = ents.difference(alltrainents)
        print("%d/%d (%.2f%%) %s set entities not in training " % (
            len(unseen), len(ents), len(unseen) * 100. / len(ents), setname))

    report("test", alltestents)
    report("validation", allvalidents)

    # gather all words in entity labels
    labelwords = {}
    labelwordsnotinglove = set()
    for label in labeldic.values():
        for labelw in tokenize(label):
            labelwords[labelw] = labelwords.get(labelw, 0) + 1
            if labelw not in glove:
                labelwordsnotinglove.add(labelw)
    print("%d unique words in labels" % len(labelwords))
    print("%d words not in glove" % len(labelwordsnotinglove))

    for splitname in ["train", "test", "valid"]:
        questions, golds = x[splitname][0], x[splitname][1]
        # NOTE(review): the number of examples whose subject has no label
        # is computed but never reported, exactly as in the original.
        wocount = sum(1 for i in range(len(questions))
                      if ed[golds[i][0]] in entidsnotinlabeldic)
def getdata(p, chardic, entdic, reldic, maxc=np.inf):
    """Encode a question/answer TSV file at character level.

    Every line of the file at ``p`` must look like
    ``question<TAB>subject predicate``.  ``entdic``, ``reldic`` and
    ``chardic`` are extended in place with every subject, predicate and
    character that was not known before.

    :param p: path of the TSV file
    :param chardic: character -> id mapping, updated in place
    :param entdic: subject -> id mapping, updated in place
    :param reldic: predicate -> id mapping, updated in place
    :param maxc: reading stops as soon as more than ``maxc`` lines were read
    :return: ``(datamat, goldmat)``; ``datamat`` holds one row of character
        ids per question (tokens re-joined with single spaces), padded with
        -1; ``goldmat`` is int32 with one [subject id, predicate id] row
        per question
    """
    data = []
    gold = []
    maxlen = 0
    c = 0
    # The context manager closes the file even when a line is malformed.
    with open(p) as infile:
        for line in infile:
            q, a = line.rstrip("\n").split("\t")
            # Distinct names: the original rebound the path parameter here.
            subj, pred = a.split()
            q = " ".join(tokenize(q))
            maxlen = max(maxlen, len(q))
            if subj not in entdic:
                entdic[subj] = len(entdic)
            if pred not in reldic:
                reldic[pred] = len(reldic)
            data.append([ord(ch) for ch in q])
            gold.append([entdic[subj], reldic[pred]])
            c += 1
            if c > maxc:
                break
    datamat = np.zeros((c, maxlen)).astype("int32") - 1
    goldmat = np.zeros((c, 2)).astype("int32")
    for i, codes in enumerate(data):
        datamat[i, :len(codes)] = codes
    for i, pair in enumerate(gold):
        goldmat[i, :] = pair
    # Build a code point -> id view of chardic, give every unseen code point
    # the smallest id that is still free, then write the result back.
    thischardic = {ord(ch): idx for ch, idx in chardic.items()}
    usedids = set(thischardic.values())
    nextid = 0
    while nextid in usedids:
        nextid += 1
    for uniquechar in np.unique(datamat):
        # Negative entries are padding, not characters.
        if uniquechar >= 0 and uniquechar not in thischardic:
            thischardic[int(uniquechar)] = nextid
            usedids.add(nextid)
            while nextid in usedids:
                nextid += 1
    chardic.update({chr(code): idx for code, idx in thischardic.items()})
    print("{} {}".format(len(chardic), chardic))
    datamat = np.vectorize(lambda x: thischardic[x] if x >= 0 else x)(datamat)
    return datamat, goldmat
def getdata(p, chardic, entdic, reldic, maxc=np.inf):
    """Encode a question/answer TSV file at character level.

    Every line of the file at ``p`` must look like
    ``question<TAB>subject predicate``.  ``entdic``, ``reldic`` and
    ``chardic`` are extended in place with every subject, predicate and
    character that was not known before.

    :param p: path of the TSV file
    :param chardic: character -> id mapping, updated in place
    :param entdic: subject -> id mapping, updated in place
    :param reldic: predicate -> id mapping, updated in place
    :param maxc: reading stops as soon as more than ``maxc`` lines were read
    :return: ``(datamat, goldmat)``; ``datamat`` holds one row of character
        ids per question (tokens re-joined with single spaces), padded with
        -1; ``goldmat`` is int32 with one [subject id, predicate id] row
        per question
    """
    data = []
    gold = []
    maxlen = 0
    c = 0
    # The context manager closes the file even when a line is malformed.
    with open(p) as infile:
        for line in infile:
            q, a = line.rstrip("\n").split("\t")
            # Distinct names: the original rebound the path parameter here.
            subj, pred = a.split()
            q = " ".join(tokenize(q))
            maxlen = max(maxlen, len(q))
            if subj not in entdic:
                entdic[subj] = len(entdic)
            if pred not in reldic:
                reldic[pred] = len(reldic)
            data.append([ord(ch) for ch in q])
            gold.append([entdic[subj], reldic[pred]])
            c += 1
            if c > maxc:
                break
    datamat = np.zeros((c, maxlen)).astype("int32") - 1
    goldmat = np.zeros((c, 2)).astype("int32")
    for i, codes in enumerate(data):
        datamat[i, :len(codes)] = codes
    for i, pair in enumerate(gold):
        goldmat[i, :] = pair
    # Build a code point -> id view of chardic, give every unseen code point
    # the smallest id that is still free, then write the result back.
    thischardic = {ord(ch): idx for ch, idx in chardic.items()}
    usedids = set(thischardic.values())
    nextid = 0
    while nextid in usedids:
        nextid += 1
    for uniquechar in np.unique(datamat):
        # Negative entries are padding, not characters.
        if uniquechar >= 0 and uniquechar not in thischardic:
            thischardic[int(uniquechar)] = nextid
            usedids.add(nextid)
            while nextid in usedids:
                nextid += 1
    chardic.update({chr(code): idx for code, idx in thischardic.items()})
    print("{} {}".format(len(chardic), chardic))
    datamat = np.vectorize(lambda x: thischardic[x] if x >= 0 else x)(datamat)
    return datamat, goldmat
def getmemdata(reldic, worddic, chardic, maxchar=30):
    """Encode all relations of ``reldic`` at word and character level.

    The relation names are tokenized; ``worddic`` and ``chardic`` are
    extended in place with the words and characters they do not contain
    yet.  The ids in ``reldic`` must be exactly 0, 1, 2, ... (asserted),
    because they are used as row numbers.

    :return: int32 tensor with one row per relation; along the last axis
        slot 0 is the word id and the remaining slots are character ids
        (at most ``maxchar`` per word), -1 is padding
    """
    by_id = sorted(reldic.items(), key=lambda item: item[1])
    rels = [(tokenize(name), relid) for name, relid in by_id]
    allrelwords = set()
    allrelchars = set()
    maxlen = 0
    maxwordlen = 0
    for position, (rel, relid) in enumerate(rels):
        # Ids have to be consecutive and start at zero.
        assert relid == position
        maxlen = max(maxlen, len(rel))
        for relw in rel:
            allrelwords.add(relw)
            maxwordlen = max(maxwordlen, len(relw))
            allrelchars.update(relw)
    maxchar = min(maxchar, maxwordlen)
    for newword in allrelwords.difference(set(worddic.keys())):
        worddic[newword] = len(worddic)
    # Every new character receives the smallest id that is still free.
    takenids = set(chardic.values())
    nextid = 0
    for newchar in allrelchars.difference(set(chardic.keys())):
        while nextid in takenids:
            nextid += 1
        chardic[newchar] = nextid
        takenids.add(nextid)
    wordmat = np.zeros((len(rels), maxlen)).astype("int32") - 1
    charten = np.zeros((len(rels), maxlen, maxchar)).astype("int32") - 1
    for rel, relid in rels:
        wordmat[relid, :len(rel)] = [worddic[relw] for relw in rel]
        for j, relw in enumerate(rel):
            charids = [chardic[ch] for ch in relw[:maxchar]]
            charten[relid, j, :len(charids)] = charids
    return np.concatenate([wordmat.reshape(wordmat.shape + (1,)), charten], axis=2)
def getmemdata(reldic, worddic, chardic, maxchar=30):
    """Encode all relations of ``reldic`` at word and character level.

    The relation names are tokenized; ``worddic`` and ``chardic`` are
    extended in place with the words and characters they do not contain
    yet.  The ids in ``reldic`` must be exactly 0, 1, 2, ... (asserted),
    because they are used as row numbers.

    :return: int32 tensor with one row per relation; along the last axis
        slot 0 is the word id and the remaining slots are character ids
        (at most ``maxchar`` per word), -1 is padding
    """
    by_id = sorted(reldic.items(), key=lambda item: item[1])
    rels = [(tokenize(name), relid) for name, relid in by_id]
    allrelwords = set()
    allrelchars = set()
    maxlen = 0
    maxwordlen = 0
    for position, (rel, relid) in enumerate(rels):
        # Ids have to be consecutive and start at zero.
        assert relid == position
        maxlen = max(maxlen, len(rel))
        for relw in rel:
            allrelwords.add(relw)
            maxwordlen = max(maxwordlen, len(relw))
            allrelchars.update(relw)
    maxchar = min(maxchar, maxwordlen)
    for newword in allrelwords.difference(set(worddic.keys())):
        worddic[newword] = len(worddic)
    # Every new character receives the smallest id that is still free.
    takenids = set(chardic.values())
    nextid = 0
    for newchar in allrelchars.difference(set(chardic.keys())):
        while nextid in takenids:
            nextid += 1
        chardic[newchar] = nextid
        takenids.add(nextid)
    wordmat = np.zeros((len(rels), maxlen)).astype("int32") - 1
    charten = np.zeros((len(rels), maxlen, maxchar)).astype("int32") - 1
    for rel, relid in rels:
        wordmat[relid, :len(rel)] = [worddic[relw] for relw in rel]
        for j, relw in enumerate(rel):
            charids = [chardic[ch] for ch in relw[:maxchar]]
            charten[relid, j, :len(charids)] = charids
    return np.concatenate([wordmat.reshape(wordmat.shape + (1,)), charten], axis=2)
def getdata(p, maxc=np.inf):
    """Encode a question/answer TSV file at word and character level.

    Every line of the file at ``p`` must look like
    ``question<TAB>subject predicate``.  The function relies on names
    provided by the surrounding scope: ``updatedics``, ``worddic``,
    ``chardic``, ``entdic``, ``reldic``, ``updateentk`` and
    ``maxnamewordlen``.

    :param p: path of the TSV file
    :param maxc: reading stops as soon as more than ``maxc`` lines were read
    :return: ``(datamat, goldmat)``.  ``datamat`` is an int32 tensor of
        shape (lines, longest question in words, maxnamewordlen + 1); slot
        0 of the last axis is the word id, the following slots hold the
        character ids of that word (cut at ``maxnamewordlen``), -1 is
        padding.  ``goldmat`` is int32 with one [subject id, predicate id]
        row per question.
    :raises Exception: when a predicate is missing from ``reldic``
    """
    data = []
    gold = []
    maxlen = 0
    c = 0
    # The context manager closes the file even when a line is malformed.
    with open(p) as infile:
        for line in infile:
            q, a = line.rstrip("\n").split("\t")
            # Distinct names: the original rebound the path parameter here.
            subj, pred = a.split()
            words = tokenize(q)
            updatedics(*words)
            maxlen = max(maxlen, len(words))
            if subj not in entdic:
                updateentk(subj)
            if pred not in reldic:
                raise Exception("impossibru!")
            wordidx = [worddic[w] if w in worddic else worddic["<RARE>"]
                       for w in words]
            charsidx = [[chardic[ch] if ch in chardic else chardic[" "]
                         for ch in word]
                        for word in words]
            data.append((wordidx, charsidx))
            gold.append([entdic[subj], reldic[pred]])
            c += 1
            if c % 100 == 0:
                print(c)
            if c > maxc:
                break
    datamat = np.zeros((c, maxlen, maxnamewordlen + 1)).astype("int32") - 1
    goldmat = np.zeros((c, 2)).astype("int32")
    for i, (wordidx, charsidx) in enumerate(data):
        datamat[i, :len(wordidx), 0] = wordidx
        # The original loop only read this slice into an unused variable and
        # never advanced the word position, so no character id was stored.
        for j, wordchars in enumerate(charsidx):
            clipped = wordchars[:maxnamewordlen]
            datamat[i, j, 1:1 + len(clipped)] = clipped
    for i, pair in enumerate(gold):
        goldmat[i, :] = pair
    return datamat, goldmat
def getcharmemdata(reldic, chardic):
    """Encode all relations of ``reldic`` as rows of character ids.

    Each relation name is tokenized and re-joined with single spaces;
    ``chardic`` is extended in place with the characters it does not
    contain yet.  The ids in ``reldic`` must be exactly 0, 1, 2, ...
    (asserted), because they are used as row numbers.

    :return: int32 matrix with one row per relation, padded with -1
    """
    by_id = sorted(reldic.items(), key=lambda item: item[1])
    rels = [(" ".join(tokenize(name)), relid) for name, relid in by_id]
    maxlen = 0
    allrelchars = set()
    for position, (rel, relid) in enumerate(rels):
        # Ids have to be consecutive and start at zero.
        assert relid == position
        maxlen = max(maxlen, len(rel))
        allrelchars.update(rel)
    # Every new character receives the smallest id that is still free.
    takenids = set(chardic.values())
    nextid = 0
    for newchar in allrelchars.difference(set(chardic.keys())):
        while nextid in takenids:
            nextid += 1
        chardic[newchar] = nextid
        takenids.add(nextid)
    retmat = np.zeros((len(rels), maxlen)).astype("int32") - 1
    for rel, relid in rels:
        charids = [chardic[ch] for ch in rel]
        retmat[relid, :len(charids)] = charids
    return retmat
def readdataset(p, wdic, maxlen=100):
    """Read a comma-separated dataset and encode column 2 as word ids.

    Column 0 of every row is the gold value, column 2 the text.  ``wdic``
    is extended in place with every unseen token.  The first row is left
    out of both returned arrays (presumably a header row - confirm against
    the data), although its tokens still enter ``wdic`` and the length
    statistics.

    :param p: path of the CSV file
    :param wdic: token -> id mapping, updated in place
    :param maxlen: maximum number of tokens kept per row
    :return: ``(datamat, gold, wdic)``; ``datamat`` is padded with the
        module-level ``masksym``, ``gold`` is an int32 array
    """
    encoded = []
    labels = []
    toolong = 0
    realmaxlen = 0
    with open(p) as f:
        for row in csv.reader(f, delimiter=","):
            tokens = tokenize(row[2])
            ntokens = len(tokens)
            if ntokens > realmaxlen:
                realmaxlen = ntokens
            if ntokens > maxlen:
                toolong += 1
            for token in set(tokens):
                if token not in wdic:
                    wdic[token] = len(wdic)
            encoded.append([wdic[token] for token in tokens])
            labels.append(row[0])
    print("{} comments were too long".format(toolong))
    maxlen = min(maxlen, realmaxlen)
    datamat = np.ones((len(encoded) - 1, maxlen)).astype("int32") * masksym
    # Row 0 of the file is skipped; longer rows are cut at maxlen.
    for rowidx, wordids in enumerate(encoded[1:]):
        clipped = wordids[:maxlen]
        datamat[rowidx, :len(clipped)] = clipped
    return datamat, np.asarray(labels[1:], dtype="int32"), wdic
def run(trainp="../fb_train.tsv", testp="../fb_test.tsv", validp="../fb_valid.tsv",
        outp="datamat.word.fb2m.pkl", entnames="../subjnames_fb2m.map",
        rellist="../predicates_fb2m.list", maxnamelen=30):
    """Build word-level data matrices and pickle them to ``outp``.

    Reads the entity names (``id<TAB>label`` per line), the relation URIs
    (one per line) and the three question files
    (``question<TAB>subject predicate`` per line).  The pickled dict holds
    the encoded "train"/"valid"/"test" splits, "worddic", "entdic"
    (entities followed by relations, relation ids shifted by the number of
    entities), "entmat" (word ids of entity and relation names, padded
    with -1), "numents" and "wordcounts".

    :param maxnamelen: maximum number of tokens kept per entity/relation name
    """
    # worddic
    worddic = {"<RARE>": 0}
    wordcounts = {"<RARE>": 0}

    def addwords(*words):
        # Register unseen words and count every occurrence.
        for word in words:
            if word not in worddic:
                worddic[word] = len(worddic)
            wordcounts[word] = wordcounts.get(word, 0) + 1

    # entity names
    entdic = {}
    entmatr = []
    maxlen = 0
    # Context managers close the inputs the original left open.
    with open(entnames) as infile:
        for c, line in enumerate(infile):
            # Strip only the newline so a final line without one stays intact.
            entity_id, entity_label = line.rstrip("\n").split("\t")
            entity_label_tokens = tokenize(entity_label)
            # keep no more than the allowed number of tokens
            entity_label_tokens = entity_label_tokens[
                :min(len(entity_label_tokens), maxnamelen)]
            maxlen = max(maxlen, len(entity_label_tokens))
            addwords(*entity_label_tokens)
            entmatr.append(entity_label_tokens)
            if entity_id not in entdic:
                entdic[entity_id] = len(entdic)
            if c % 1e3 == 0:
                print("{}k".format(c / 1e3))

    entswonames = set()

    def add_entity_wo_label(entid):
        # Entities first met in the question files have no label.
        assert(entid not in entdic)
        entdic[entid] = len(entdic)
        entswonames.add(entid)
        entmatr.append(["<RARE>"])

    # relation uri's
    reldic = {}
    relmatr = []
    with open(rellist) as infile:
        for line in infile:
            relation_uri = line.rstrip("\n")
            relation_uri_tokens = reluri_tokenize(relation_uri)
            relation_uri_tokens = relation_uri_tokens[
                :min(len(relation_uri_tokens), maxnamelen)]
            maxlen = max(maxlen, len(relation_uri_tokens))
            addwords(*relation_uri_tokens)
            relmatr.append(relation_uri_tokens)
            if relation_uri not in reldic:
                reldic[relation_uri] = len(reldic)

    maxnamelen = min(maxlen, maxnamelen)
    print("{} {} {} {}".format(len(entdic), len(reldic), len(worddic), maxnamelen))

    def getdata(p, maxc=np.inf):
        # Encode one question file: word-id rows plus [subject, predicate].
        data = []
        gold = []
        maxlen = 0
        c = 0
        with open(p) as infile:
            for line in infile:
                question, answer = line.rstrip("\n").split("\t")
                subject, predicate = answer.split(" ")
                question_words = tokenize(question)
                addwords(*question_words)
                maxlen = max(maxlen, len(question_words))
                if subject not in entdic:
                    add_entity_wo_label(subject)
                if predicate not in reldic:
                    raise Exception("predicate should be there")
                wordidx = [worddic[x] if x in worddic else worddic["<RARE>"]
                           for x in question_words]
                data.append(wordidx)
                gold.append([entdic[subject], reldic[predicate]])
                c += 1
                if c % 100 == 0:
                    print(c)
                if c > maxc:
                    break
        datamat = np.zeros((c, maxlen)).astype("int32") - 1
        goldmat = np.zeros((c, 2)).astype("int32")
        for i, wordidx in enumerate(data):
            datamat[i, :len(wordidx)] = wordidx
        for i, pair in enumerate(gold):
            goldmat[i, :] = pair
        return datamat, goldmat

    traindata = getdata(trainp)
    validdata = getdata(validp)
    testdata = getdata(testp)

    # build ent mat
    entmat = np.zeros((len(entmatr), maxnamelen), dtype="int32") - 1
    for i, tokens in enumerate(entmatr):
        entmat[i, :len(tokens)] = [worddic[a] for a in tokens]
    # build rel mat
    relmat = np.zeros((len(relmatr), maxnamelen), dtype="int32") - 1
    for i, tokens in enumerate(relmatr):
        relmat[i, :len(tokens)] = [worddic[a] for a in tokens]

    # package: relations are appended after the entities, so their ids
    # are shifted by the number of entities
    entmat = np.concatenate([entmat, relmat], axis=0)
    numents = len(entdic)
    traindata[1][:, 1] += numents
    validdata[1][:, 1] += numents
    testdata[1][:, 1] += numents
    entdic.update({k: v + numents for k, v in reldic.items()})

    # save
    acc = {
        "train": traindata,
        "valid": validdata,
        "test": testdata,
        "worddic": worddic,
        "entdic": entdic,
        "entmat": entmat,
        "numents": numents,
        "wordcounts": wordcounts,
    }
    print("{} entities without names in datasets".format(len(entswonames)))
    # Closing the file guarantees the pickle is flushed to disk.
    # NOTE(review): the file mode is left as "w" to stay consistent with
    # readers that open the pickle in text mode.
    with open(outp, "w") as outfile:
        pickle.dump(acc, outfile)
class SimpleQuestionsLabelIndex(object):
    """Writes entity labels into an Elasticsearch index."""

    def __init__(self, host="drogon", index="simplequestions_labels"):
        """Remember the Elasticsearch host and the name of the index."""
        self.host = host
        self.indexp = index

    def index(self, labelp="labels.map"):
        """(Re)create the index and fill it from the file at ``labelp``.

        Every line of the file must be ``id<TAB>label``.  The label is
        tokenized, re-joined with single spaces and stored in field
        "label"; the id is stored in field "fbid".  Documents are numbered
        from 1 in file order.

        :param labelp: path of the tab-separated label file
        """
        es = elasticsearch.Elasticsearch(hosts=[self.host])
        try:
            es.indices.delete(index=self.indexp)
        except Exception:
            # Best effort: the delete may fail (e.g. on a first run, when
            # the index does not exist yet); report it and carry on.
            print("could not delete index %s" % self.indexp)
        es.indices.create(
            index=self.indexp,
            body={
                "settings": {
                    "index": {
                        "analysis": {
                            "filter": {
                                "pstemmer": {
                                    "type": "porter_stem",
                                    "language": "_english_"
                                }
                            },
                            "char_filter": {
                                "punctfil": {
                                    "type": "pattern_replace",
                                    # raw string: same value, no invalid escape
                                    "pattern": r"[\W]",
                                    "replacement": ""
                                }
                            },
                            "analyzer": {
                                "myana": {
                                    "tokenizer": "whitespace",
                                    "filter": [
                                        "lowercase",
                                        "pstemmer",
                                    ]
                                    # ,"char_filter": ["punctfil"]
                                }
                            }
                        }
                    }
                },
                "mappings": {
                    "labelmap": {
                        "properties": {
                            "label": {
                                "type": "string",
                                "analyzer": "myana",
                                "fields": {
                                    "len": {
                                        "type": "token_count",
                                        "store": "yes",
                                        "analyzer": "myana"
                                    }
                                }
                            }
                        }
                    }
                }
            })
        # The context manager closes the label file the original left open.
        with open(labelp) as infile:
            for i, line in enumerate(infile, 1):
                # Strip only the newline so a final line without one stays
                # intact.
                k, v = line.rstrip("\n").split("\t")
                vt = tokenize(v)
                es.index(index=self.indexp, doc_type="labelmap", id=i,
                         body={
                             "label": " ".join(vt),
                             "fbid": k
                         })
                if i % 1000 == 0:
                    print(i)
        print("indexed labels")
def searchsentence(self, s, top=None, topsize=None, exact=True):
    """Tokenize ``s`` and search for all of its n-grams.

    The n-grams come from ``self.getallngrams(tokens, topsize)`` and are
    handed to ``self.searchallngrams`` together with ``top`` and ``exact``;
    its result is returned unchanged.
    """
    tokens = tokenize(s)
    candidates = self.getallngrams(tokens, topsize)
    result = self.searchallngrams(candidates, top, exact=exact)
    return result
def __call__(self, x):
    """Return the tokenized label of ``x[0]`` together with ``x[1]``.

    ``x[0]`` is mapped through ``self.labelfy``, tokenized and cut to at
    most ``self.maxlen`` tokens; ``x[1]`` is passed through unchanged.
    """
    tokens = tokenize(self.labelfy(x[0]))
    limit = min(self.maxlen, len(tokens))
    clipped = tokens[:limit]
    return clipped, x[1]
def __call__(self, x):
    """Return the tokenized label of ``x[0]`` together with ``x[1]``.

    ``x[0]`` is mapped through ``self.labelfy``, tokenized and cut to at
    most ``self.maxlen`` tokens; ``x[1]`` is passed through unchanged.
    """
    tokens = tokenize(self.labelfy(x[0]))
    limit = min(self.maxlen, len(tokens))
    clipped = tokens[:limit]
    return clipped, x[1]
def run(trainp="fb_train.tsv", testp="fb_test.tsv", validp="fb_valid.tsv",
        outp="datamat.char.mem.fb2m.pkl", entnames="subjnames_fb2m.map",
        rellist="rels_fb2m.list", maxnamelen=100):
    """Build character-level data matrices and pickle them to ``outp``.

    Reads the entity names (``id<TAB>name`` per line), the relation list
    (one relation per line) and the three question files
    (``question<TAB>subject predicate`` per line).  The pickled dict holds
    the encoded "train"/"valid"/"test" splits, "chardic", "entdic"
    (entities followed by relations, relation ids shifted by the number of
    entities), "entmat" (character ids of entity and relation names,
    padded with -1) and "numents".

    :param maxnamelen: maximum number of characters kept per name
    """
    # chardic
    chardic = {" ": 0}

    def updatechardic(*chars):
        for char in chars:
            if char not in chardic:
                chardic[char] = len(chardic)

    # process entity names and relation list
    entdic = {}
    entmatr = []
    entswonames = set()
    maxlen = 0
    # Context managers close the inputs the original left open.
    with open(entnames) as infile:
        for c, line in enumerate(infile):
            e, n = line.split("\t")
            nt = " ".join(tokenize(n))
            nt = nt[:min(len(nt), maxnamelen)]
            maxlen = max(maxlen, len(nt))
            updatechardic(*set(nt))
            entmatr.append(nt)
            if e not in entdic:
                entdic[e] = len(entdic)
            if c % 1e3 == 0:
                print("%.0fk" % (c / 1e3))

    def updateentk(*ents):
        # ents have not been seen during initial population ==> no titles
        for ent in ents:
            assert(ent not in entdic)
            entdic[ent] = len(entdic)
            entswonames.add(ent)
            entmatr.append([" "])

    reldic = {}
    relmatr = []
    with open(rellist) as infile:
        for line in infile:
            # Strip only the newline so a final line without one stays intact.
            r = line.rstrip("\n")
            rt = " ".join(tokenize(r))
            rt = rt[:min(len(rt), maxnamelen)]
            maxlen = max(maxlen, len(rt))
            updatechardic(*set(rt))
            relmatr.append(rt)
            # Dotted names are rewritten to the slash form used as key.
            r = "/" + r.replace(".", "/")
            if r not in reldic:
                reldic[r] = len(reldic)

    maxnamelen = min(maxlen, maxnamelen)
    print("{} {} {} {}".format(len(entdic), len(reldic), len(chardic), maxnamelen))

    # process data
    def getdata(p, maxc=np.inf):
        # Encode one question file: char-id rows plus [subject, predicate].
        data = []
        gold = []
        maxlen = 0
        c = 0
        with open(p) as infile:
            for line in infile:
                q, a = line.rstrip("\n").split("\t")
                subj, pred = a.split()
                chars = " ".join(tokenize(q))
                updatechardic(*set(chars))
                maxlen = max(maxlen, len(chars))
                if subj not in entdic:
                    updateentk(subj)
                if pred not in reldic:
                    raise Exception("impossibru!")
                charidx = [chardic[ch] if ch in chardic else chardic[" "]
                           for ch in chars]
                data.append(charidx)
                gold.append([entdic[subj], reldic[pred]])
                c += 1
                if c % 100 == 0:
                    print(c)
                if c > maxc:
                    break
        datamat = np.zeros((c, maxlen)).astype("int32") - 1
        goldmat = np.zeros((c, 2)).astype("int32")
        for i, charidx in enumerate(data):
            datamat[i, :len(charidx)] = charidx
        for i, pair in enumerate(gold):
            goldmat[i, :] = pair
        return datamat, goldmat

    traindata = getdata(trainp)
    validdata = getdata(validp)
    testdata = getdata(testp)

    # build ent mat
    entmat = np.zeros((len(entmatr), maxnamelen), dtype="int32") - 1
    for i, name in enumerate(entmatr):
        entmat[i, :len(name)] = [chardic[a] for a in name]
    # build rel mat
    relmat = np.zeros((len(relmatr), maxnamelen), dtype="int32") - 1
    for i, name in enumerate(relmatr):
        relmat[i, :len(name)] = [chardic[a] for a in name]

    # pre-package tests:
    print("{} {}".format(entmat.shape[0], len(entdic)))
    assert(entmat.shape[0] == len(entdic))

    # package: relations are appended after the entities, so their ids
    # are shifted by the number of entities
    entmat = np.concatenate([entmat, relmat], axis=0)
    numents = len(entdic)
    traindata[1][:, 1] += numents
    validdata[1][:, 1] += numents
    testdata[1][:, 1] += numents
    reldic = {k: v + numents for k, v in reldic.items()}
    entdic.update(reldic)

    # save
    acc = {
        "train": traindata,
        "valid": validdata,
        "test": testdata,
        "chardic": chardic,
        "entdic": entdic,
        "entmat": entmat,
        "numents": numents
    }
    print("%d entities without names in datasets" % len(entswonames))
    # Closing the file guarantees the pickle is flushed to disk.
    # NOTE(review): the file mode is left as "w" to stay consistent with
    # readers that open the pickle in text mode.
    with open(outp, "w") as outfile:
        pickle.dump(acc, outfile)
# predict(self, data, entcans=None, relsperent=None, relcans=None, multiprune=-1)
#
# Predicts one (subject, relation) pair per question and returns them as an
# int32 array with two columns: best subject, best relation.
#
# Steps visible in the code:
#   * prints ``multiprune``; asserts that exactly one of ``relsperent`` and
#     ``relcans`` is given and that ``entcans`` is given.
#   * calls ``self.encodequestions(data)`` and ``self.ranksubjects(entcans)``;
#     the best subject of a question is the first item of the first
#     (subject, score) pair of its ranking.
#   * when ``relcans`` is given, relations are ranked over it with
#     ``self.rankrelations`` and the top relation is taken.
#   * otherwise, with ``multiprune <= 0`` the relation candidates of a
#     question are ``relsperent[bestsubj][0]`` (empty list for a subject
#     missing from ``relsperent``).
#   * otherwise (``multiprune > 0``) the candidates of a question are
#     collected from every ranked subject that shares the tokenized label
#     (``self.subjinfo[subj][0]``) of the top subject or lies among the first
#     ``multiprune`` positions; relations are ranked over them, then the
#     ranked subjects are filtered down to those that have the chosen
#     relation according to ``relsperent``; a question left without any
#     subject gets ``(-1, -1.)``.
#
# NOTE(review): this definition is stored without its original line breaks
# and indentation, so the text alone does not show whether the "build ents
# per relation" / "filter rankedsubjs" steps run only for ``multiprune > 0``
# or for every call that passes ``relsperent`` - confirm against the
# original file before restructuring. The code below is left untouched.
def predict(self, data, entcans=None, relsperent=None, relcans=None, multiprune=-1): print multiprune assert (relsperent is None or relcans is None) assert (relsperent is not None or relcans is not None) assert (entcans is not None) self.encodequestions(data) rankedsubjs = self.ranksubjects(entcans) bestsubjs = [x[0][0] for x in rankedsubjs] if relcans is not None: rankedrels = self.rankrelations(relcans) bestrels = [x[0][0] for x in rankedrels] else: if multiprune <= 0: relcans = [ relsperent[bestsubj][0] if bestsubj in relsperent else [] for bestsubj in bestsubjs ] rankedrels = self.rankrelations(relcans) bestrels = [x[0][0] for x in rankedrels] else: print "multipruning !!!!!!!!!!!!!!!!!" topk = multiprune # TOP K !!!!!!!!!!!!!!!!!!!!!!!!!!!! # get relcans relcans = [] for subjranking in rankedsubjs: toplabel = None relcanse = [] i = 0 for subj, score in subjranking: subjlabel = " ".join( tokenize(self.subjinfo[subj][0]) if subj in self.subjinfo else []) topcan = None if toplabel is None: toplabel = subjlabel topcan = subj elif subjlabel == toplabel: topcan = subj elif i < topk: topcan = subj else: pass toadd = relsperent[topcan][ 0] if topcan in relsperent else [] relcanse.extend(toadd) i += 1 relcans.append(relcanse) # rank relations rankedrels = self.rankrelations(relcans) bestrels = [x[0][0] for x in rankedrels] # build ents per relation entsperrel = {} for ent, rels in relsperent.items(): for rel in rels[0]: if rel not in entsperrel: entsperrel[rel] = set() entsperrel[rel].add(ent) # filter rankedsubjs filteredrankedsubjs = [] for i in range(len(rankedsubjs)): filteredrankedsubjs.append([]) for subj, score in rankedsubjs[i]: if bestrels[i] in entsperrel and \ subj in entsperrel[bestrels[i]]: filteredrankedsubjs[i].append((subj, score)) if len(filteredrankedsubjs[i]) == 0: filteredrankedsubjs[i].append((-1, -1.)) bestsubjs = [x[0][0] for x in filteredrankedsubjs] ret = np.concatenate([ np.expand_dims(np.asarray(bestsubjs, dtype="int32"), axis=1), 
np.expand_dims(np.asarray(bestrels, dtype="int32"), axis=1) ], axis=1) return ret
def getwords(s):
    """Return the words of ``s`` as produced by ``tokenize``."""
    words = tokenize(s)
    return words
def getdata(p, worddic, chardic, entdic, reldic, maxc=np.inf, maxchar=30):
    """Encode a question/answer TSV file at word and character level.

    Every line of the file at ``p`` must look like
    ``question<TAB>subject predicate``.  ``worddic``, ``entdic``,
    ``reldic`` and ``chardic`` are extended in place with every word,
    subject, predicate and character that was not known before.

    :param p: path of the TSV file
    :param worddic: word -> id mapping, updated in place
    :param chardic: character -> id mapping, updated in place
    :param entdic: subject -> id mapping, updated in place
    :param reldic: predicate -> id mapping, updated in place
    :param maxc: reading stops as soon as more than ``maxc`` lines were read
    :param maxchar: upper bound on the characters stored per word; lowered
        to the longest word actually seen
    :return: ``(datamat, goldmat)``.  ``datamat`` has shape
        (lines, longest question in words, 1 + maxchar): slot 0 of the last
        axis is the word id, the remaining slots are character ids, -1 is
        padding.  ``goldmat`` is an int32 (lines, 2) matrix of
        [subject id, predicate id].
    """
    data = []
    gold = []
    maxlen = 0
    maxwordlen = 0
    c = 0
    # The context manager closes the file even when a line is malformed.
    with open(p) as infile:
        for line in infile:
            q, a = line.rstrip("\n").split("\t")
            # Distinct names: the original rebound the path parameter here.
            subj, pred = a.split()
            words = tokenize(q)
            maxlen = max(maxlen, len(words))
            for word in words:
                maxwordlen = max(maxwordlen, len(word))
                if word not in worddic:
                    worddic[word] = len(worddic)
            if subj not in entdic:
                entdic[subj] = len(entdic)
            if pred not in reldic:
                reldic[pred] = len(reldic)
            data.append(words)
            gold.append([entdic[subj], reldic[pred]])
            c += 1
            if c > maxc:
                break
    print(maxwordlen)
    maxchar = min(maxchar, maxwordlen)
    wordmat = np.zeros((c, maxlen)).astype("int32") - 1
    charten = np.zeros((c, maxlen, maxchar)).astype("int32") - 1
    goldmat = np.zeros((c, 2)).astype("int32")
    for i, sent in enumerate(data):
        for j, word in enumerate(sent):
            if len(word) > maxchar:
                # Report words whose tail is cut off.
                print(word)
            wordmat[i, j] = worddic[word]
            codes = [ord(ch) for ch in word[:maxchar]]
            charten[i, j, :len(codes)] = codes
    for i, pair in enumerate(gold):
        goldmat[i, :] = pair
    # Build a code point -> id view of chardic, give every unseen code point
    # the smallest id that is still free, then write the result back.
    thischardic = {ord(ch): idx for ch, idx in chardic.items()}
    usedids = set(thischardic.values())
    nextid = 0
    while nextid in usedids:
        nextid += 1
    for uniquechar in np.unique(charten):
        # Negative entries are padding, not characters.
        if uniquechar >= 0 and uniquechar not in thischardic:
            thischardic[int(uniquechar)] = nextid
            usedids.add(nextid)
            while nextid in usedids:
                nextid += 1
    chardic.update({chr(code): idx for code, idx in thischardic.items()})
    print("{} {}".format(len(chardic), chardic))
    charten = np.vectorize(lambda x: thischardic[x] if x >= 0 else x)(charten)
    datamat = np.concatenate([wordmat.reshape(wordmat.shape + (1,)), charten], axis=2)
    return datamat, goldmat
def run(trainp="fb_train.tsv", testp="fb_test.tsv", validp="fb_valid.tsv",
        outp="datamat.word.mem.fb2m.pkl", entnames="subjnames_fb2m.map",
        rellist="rels_fb2m.list", maxnamelen=30):
    """Build word-level data matrices and pickle them to ``outp``.

    Reads the entity names (``id<TAB>name`` per line), the relation list
    (one relation per line) and the three question files
    (``question<TAB>subject predicate`` per line).  The pickled dict holds
    the encoded "train"/"valid"/"test" splits, "worddic", "entdic"
    (entities followed by relations, relation ids shifted by the number of
    entities), "entmat" (word ids of entity and relation names, padded
    with -1) and "numents".

    :param maxnamelen: maximum number of tokens kept per name
    """
    # worddic
    worddic = {"<RARE>": 0}

    def updateworddic(*words):
        for word in words:
            if word not in worddic:
                worddic[word] = len(worddic)

    # process entity names and relation list
    entdic = {}
    entmatr = []
    entswonames = set()
    maxlen = 0
    # Context managers close the inputs the original left open.
    with open(entnames) as infile:
        for c, line in enumerate(infile):
            e, n = line.split("\t")
            nt = tokenize(n)
            nt = nt[:min(len(nt), maxnamelen)]
            maxlen = max(maxlen, len(nt))
            updateworddic(*nt)
            entmatr.append(nt)
            if e not in entdic:
                entdic[e] = len(entdic)
            if c % 1e3 == 0:
                print("%.0fk" % (c / 1e3))

    def updateentk(*ents):
        # ents have not been seen during initial population ==> no titles
        for ent in ents:
            assert (ent not in entdic)
            entdic[ent] = len(entdic)
            entswonames.add(ent)
            entmatr.append(["<RARE>"])

    reldic = {}
    relmatr = []
    with open(rellist) as infile:
        for line in infile:
            # Strip only the newline so a final line without one stays intact.
            r = line.rstrip("\n")
            rt = tokenize(r)
            rt = rt[:min(len(rt), maxnamelen)]
            maxlen = max(maxlen, len(rt))
            updateworddic(*rt)
            relmatr.append(rt)
            # Dotted names are rewritten to the slash form used as key.
            r = "/" + r.replace(".", "/")
            if r not in reldic:
                reldic[r] = len(reldic)

    maxnamelen = min(maxlen, maxnamelen)
    print("{} {} {} {}".format(len(entdic), len(reldic), len(worddic), maxnamelen))

    # process data
    def getdata(p, maxc=np.inf):
        # Encode one question file: word-id rows plus [subject, predicate].
        data = []
        gold = []
        maxlen = 0
        c = 0
        with open(p) as infile:
            for line in infile:
                q, a = line.rstrip("\n").split("\t")
                subj, pred = a.split()
                words = tokenize(q)
                updateworddic(*words)
                maxlen = max(maxlen, len(words))
                if subj not in entdic:
                    updateentk(subj)
                if pred not in reldic:
                    raise Exception("impossibru!")
                wordidx = [worddic[w] if w in worddic else worddic["<RARE>"]
                           for w in words]
                data.append(wordidx)
                gold.append([entdic[subj], reldic[pred]])
                c += 1
                if c % 100 == 0:
                    print(c)
                if c > maxc:
                    break
        datamat = np.zeros((c, maxlen)).astype("int32") - 1
        goldmat = np.zeros((c, 2)).astype("int32")
        for i, wordidx in enumerate(data):
            datamat[i, :len(wordidx)] = wordidx
        for i, pair in enumerate(gold):
            goldmat[i, :] = pair
        return datamat, goldmat

    traindata = getdata(trainp)
    validdata = getdata(validp)
    testdata = getdata(testp)

    # build ent mat
    entmat = np.zeros((len(entmatr), maxnamelen), dtype="int32") - 1
    for i, tokens in enumerate(entmatr):
        entmat[i, :len(tokens)] = [worddic[a] for a in tokens]
    # build rel mat
    relmat = np.zeros((len(relmatr), maxnamelen), dtype="int32") - 1
    for i, tokens in enumerate(relmatr):
        relmat[i, :len(tokens)] = [worddic[a] for a in tokens]

    # pre-package tests:
    print("{} {}".format(entmat.shape[0], len(entdic)))
    assert (entmat.shape[0] == len(entdic))

    # package: relations are appended after the entities, so their ids
    # are shifted by the number of entities
    entmat = np.concatenate([entmat, relmat], axis=0)
    numents = len(entdic)
    traindata[1][:, 1] += numents
    validdata[1][:, 1] += numents
    testdata[1][:, 1] += numents
    reldic = {k: v + numents for k, v in reldic.items()}
    entdic.update(reldic)

    # save
    acc = {
        "train": traindata,
        "valid": validdata,
        "test": testdata,
        "worddic": worddic,
        "entdic": entdic,
        "entmat": entmat,
        "numents": numents
    }
    print("%d entities without names in datasets" % len(entswonames))
    # Closing the file guarantees the pickle is flushed to disk.
    # NOTE(review): the file mode is left as "w" to stay consistent with
    # readers that open the pickle in text mode.
    with open(outp, "w") as outfile:
        pickle.dump(acc, outfile)
def processline(self, x):
    """Return ``x`` tokenized and re-joined with single spaces."""
    tokens = tokenize(x)
    separator = " "
    return separator.join(tokens)
def reluri_tokenize(reluri):
    """Tokenize a relation URI after turning "/" and "_" into spaces."""
    spaced = reluri
    for separator in ("/", "_"):
        spaced = spaced.replace(separator, " ")
    return tokenize(spaced)