Example #1
 def build(self, p):
     i = 0
     tt = ticktock("builder")
     tt.tick("building")
     for line in open(p):
         sline = line[:-1].split("\t")
         fb_id = sline[0]
         triplecount = int(sline[1]) + int(sline[2])
         name = self.processor.processline(sline[3])
         type_id = sline[4]
         type_id = type_id if type_id != "<UNK>" else None
         type_name = " ".join(tokenize(sline[5]))
         type_name = type_name if type_name != " ".join(
             tokenize("<UNK>")) else None
         if name not in self.indexdict:
             self.indexdict[name] = []
         self.indexdict[name].append({
             "fb_id": fb_id,
             "triplecount": triplecount,
             "type_id": type_id,
             "type_name": type_name
         })
         i += 1
         if i % 1000 == 0:
             tt.live("{}k".format(i // 1000))
     tt.tock("built")
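
# Hypothetical usage sketch for the index built above -- the helper name,
# sample ids and types are illustrative stand-ins, not the original API:
indexdict = {}

def add_entry(name, fb_id, triplecount, type_id, type_name):
    # same grouping as build(): entities sharing a surface name land in one list
    indexdict.setdefault(name, []).append({
        "fb_id": fb_id, "triplecount": triplecount,
        "type_id": type_id, "type_name": type_name})

add_entry("obama", "m.02mjmr", 1200, "m.t1", "politician")
add_entry("obama", "m.0abcde", 3, None, None)

# candidates for a mention, most-connected first (a common disambiguation heuristic)
candidates = sorted(indexdict["obama"], key=lambda d: -d["triplecount"])
print(candidates[0]["fb_id"])   # m.02mjmr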
Example #2
def getdata(p, worddic, chardic, entdic, reldic, maxc=np.infty, maxchar=30):
    data = []
    gold = []
    maxlen = 0
    maxwordlen = 0
    c = 0
    for line in open(p):
        q, a = (line[:-1] if line[-1] == "\n" else line).split("\t")
        s, p = a.split()
        words = tokenize(q)
        maxlen = max(maxlen, len(words))
        for word in words:
            maxwordlen = max(maxwordlen, len(word))
            if word not in worddic:
                worddic[word] = len(worddic)
        if s not in entdic:
            entdic[s] = len(entdic)
        if p not in reldic:
            reldic[p] = len(reldic)
        data.append(words)
        gold.append([entdic[s], reldic[p]])
        c += 1
        if c > maxc:
            break
    print maxwordlen
    maxchar = min(maxchar, maxwordlen)
    wordmat = np.zeros((c, maxlen)).astype("int32") - 1
    charten = np.zeros((c, maxlen, maxchar)).astype("int32") - 1
    goldmat = np.zeros((c, 2)).astype("int32")
    i = 0
    for sent in data:
        j = 0
        for word in sent:
            if len(word) > maxchar:
                print word
            wordmat[i, j] = worddic[word]
            chars = map(ord, word)
            charten[i, j, :min(len(chars), maxchar)] = chars[:min(len(chars), maxchar)]
            j += 1
        i += 1
    i = 0
    for x in gold:
        goldmat[i, :] = x
        i += 1
    # making chardic and transforming through chardic
    thischardic = dict(map(lambda (x,y): (ord(x), y), chardic.items()))
    nextid = 0
    while nextid in thischardic.values():
        nextid += 1
    uniquechars = np.unique(charten)
    for uniquechar in list(uniquechars):
        if not uniquechar in thischardic and uniquechar >= 0:
            thischardic[uniquechar] = nextid
            while nextid in thischardic.values():
                nextid += 1
    chardic.update(dict(map(lambda (x, y): (chr(x), y), thischardic.items())))
    print len(chardic), chardic
    charten = np.vectorize(lambda x: thischardic[x] if x >= 0 else x)(charten)
    datamat = np.concatenate([wordmat.reshape(wordmat.shape + (1,)), charten], axis=2)
    return datamat, goldmat
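
# The packed datamat layout, shown on a toy tensor: slice 0 of the last axis
# holds the word ids, the remaining maxchar slices hold char ids, -1 is padding.
# (Shapes inferred from the concatenate above; toy values are assumptions.)
import numpy as np

datamat = np.array([[[5, 0, 1, 2],       # word id 5, char ids 0 1 2
                     [7, 3, -1, -1]]],   # word id 7, char id 3, padded
                   dtype="int32")
wordmat = datamat[:, :, 0]               # (num_examples, maxlen)
charten = datamat[:, :, 1:]              # (num_examples, maxlen, maxchar)
print(wordmat[0])                        # [5 7]
print(charten[0, 1][charten[0, 1] >= 0]) # [3]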
Example #3
 def getdata(p, maxc=np.infty):
     data = []
     gold = []
     maxlen = 0
     c = 0
     for line in open(p):
         question, answer = line[:-1].split("\t")
         subject, predicate = answer.split(" ")
         question_words = tokenize(question)
         addwords(*question_words)
         maxlen = max(maxlen, len(question_words))
         if subject not in entdic:
             add_entity_wo_label(subject)
         if predicate not in reldic:
             raise Exception("predicate should be there")
         wordidx = [worddic[x] if x in worddic else worddic["<RARE>"] for x in question_words]
         data.append(wordidx)
         gold.append([entdic[subject], reldic[predicate]])
         c += 1
         if c % 100 == 0:
             print c
         if c > maxc:
             break
     datamat = np.zeros((c, maxlen)).astype("int32") - 1
     goldmat = np.zeros((c, 2)).astype("int32")
     i = 0
     for x in data:
         datamat[i, :len(x)] = x
         i += 1
     i = 0
     for x in gold:
         goldmat[i, :] = x
         i += 1
     return datamat, goldmat
Example #4
 def getdata(p, maxc=np.infty):
     data = []
     gold = []
     maxlen = 0
     c = 0
     for line in open(p):
         q, a = (line[:-1] if line[-1] == "\n" else line).split("\t")
         s, p = a.split()
         chars = " ".join(tokenize(q))
         updatechardic(*set(chars))
         maxlen = max(maxlen, len(chars))
         if s not in entdic:
             updateentk(s)
         if p not in reldic:
             raise Exception("impossibru!")
         charidx = map(lambda x: chardic[x] if x in chardic else chardic[" "], chars)
         data.append(charidx)
         gold.append([entdic[s], reldic[p]])
         c += 1
         if c % 100 == 0:
             print c
         if c > maxc:
             break
     datamat = np.zeros((c, maxlen)).astype("int32") - 1
     goldmat = np.zeros((c, 2)).astype("int32")
     i = 0
     for x in data:
         datamat[i, :len(x)] = x
         i += 1
     i = 0
     for x in gold:
         goldmat[i, :] = x
         i += 1
     return datamat, goldmat
Example #5
 def stemmedprocessline(self, x):
     x = x.replace("'s", "")
     x = x.replace("' s", "")
     x = x.replace("'", "")
     tokens = tokenize(x)
     #print tokens
     stokens = [self.stemmer.stem(token) for token in tokens]
     return " ".join(stokens)
Example #6
def run(labelp="labels.map", datap="datamat.word.pkl"):
    labeldic = loadlabels(labelp)
    glove = Glove(50)
    print "the" in glove
    len(labeldic)
    x = pickle.load(open(datap))
    print len(x["train"][0])
    # get entities without labels
    allentids = sorted(x["entdic"].items(), key=lambda (a, b): b)[:x["numents"]]
    print allentids[:20]
    entidsnotinlabeldic = set(map(lambda (a, b): a, allentids)).difference(set(labeldic.keys()))
    print len(entidsnotinlabeldic), list(entidsnotinlabeldic)[:20]
    wd = {v: k for k, v in x["worddic"].items()}
    ed = {v: k for k, v in x["entdic"].items()}
    alltrainents = set(x["train"][1][:, 0])
    allvalidents = set(x["valid"][1][:, 0])
    alltestents = set(x["test"][1][:, 0])
    print "%d/%d (%.2f%%) test set entities not in training " % (
        len(alltestents.difference(alltrainents)), len(alltestents),
        len(alltestents.difference(alltrainents)) * 100. / len(alltestents))
    print "%d/%d (%.2f%%) validation set entities not in training " % (
        len(allvalidents.difference(alltrainents)), len(allvalidents),
        len(allvalidents.difference(alltrainents)) * 100. / len(allvalidents))
    #print len(allvalidents.difference(alltrainents))

    # gather all words in entity labels
    labelwords = {}
    labelwordsnotinglove = set()
    for label in labeldic.values():
        for labelw in tokenize(label):
            if labelw not in labelwords:
                labelwords[labelw] = 0
            labelwords[labelw] += 1
            if labelw not in glove:
                labelwordsnotinglove.add(labelw)

    print "%d unique words in labels" % len(labelwords)
    print "%d words not in glove" % len(labelwordsnotinglove)

    for split in ["train", "test", "valid"]:
        #print "SPLIT %s" % split
        split = x[split]
        #print len(split[0])
        wocount = 0
        for i in range(len(split[0])):
            #print ed[split[1][i][0]], ed[split[1][i][0]] in entidsnotinlabeldic
            #break
            if ed[split[1][i][0]] in entidsnotinlabeldic:
                #print " ".join(map(lambda x: wd[x] if x in wd else "", list(split[0][i]))), ed[split[1][i][0]]
                wocount += 1
            else:
                pass
Example #7
def getdata(p, chardic, entdic, reldic, maxc=np.infty):
    data = []
    gold = []
    maxlen = 0
    c = 0
    for line in open(p):
        q, a = (line[:-1] if line[-1] == "\n" else line).split("\t")
        s, p = a.split()
        words = tokenize(q)
        q = " ".join(words)
        maxlen = max(maxlen, len(q))
        chars = map(ord, q)
        if len(set(chars).intersection({123})) > 0:
            pass #print line, q
        if s not in entdic:
            entdic[s] = len(entdic)
        if p not in reldic:
            reldic[p] = len(reldic)
        data.append(chars)
        gold.append([entdic[s], reldic[p]])
        c += 1
        if c > maxc:
            break
    datamat = np.zeros((c, maxlen)).astype("int32") - 1
    goldmat = np.zeros((c, 2)).astype("int32")
    i = 0
    for x in data:
        datamat[i, :len(x)] = x
        i += 1
    i = 0
    for x in gold:
        goldmat[i, :] = x
        i += 1
    # making chardic and transforming through chardic
    thischardic = dict(map(lambda (x,y): (ord(x), y), chardic.items()))
    nextid = 0
    while nextid in thischardic.values():
        nextid += 1
    uniquechars = np.unique(datamat)
    for uniquechar in list(uniquechars):
        if not uniquechar in thischardic and uniquechar >= 0:
            thischardic[uniquechar] = nextid
            while nextid in thischardic.values():
                nextid += 1
    chardic.update(dict(map(lambda (x, y): (chr(x), y), thischardic.items())))
    print len(chardic), chardic
    datamat = np.vectorize(lambda x: thischardic[x] if x >= 0 else x)(datamat)
    return datamat, goldmat
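
# The chardic round-trip in isolation: the function keeps chardic keyed by
# characters between calls, works on ord codes internally, assigns dense ids
# to unseen codes, then converts back with chr. A minimal re-run of that
# block on toy data (values here are assumptions):
chardic = {"a": 0}
thischardic = dict((ord(k), v) for k, v in chardic.items())
nextid = 1                              # id 0 is already taken
for code in [ord("a"), ord("b"), ord(" ")]:
    if code not in thischardic:
        thischardic[code] = nextid
        nextid += 1
chardic.update(dict((chr(k), v) for k, v in thischardic.items()))
print(chardic)                          # ids: a=0, b=1, space=2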
Example #8
def getmemdata(reldic,
               worddic,
               chardic,
               maxchar=30):  # updates worddic with words found in relation
    rels = sorted(reldic.items(), key=lambda (x, y): y)
    rels = map(lambda (x, y): (tokenize(x), y), rels)
    allrelwords = set()
    allrelchars = set()
    maxlen = 0
    maxwordlen = 0
    prevc = -1
    for rel, c in rels:
        assert (c - 1 == prevc)
        prevc = c
        maxlen = max(maxlen, len(rel))
        for relw in rel:
            allrelwords.add(relw)
            maxwordlen = max(maxwordlen, len(relw))
            for relwchar in relw:
                allrelchars.add(relwchar)
    maxchar = min(maxchar, maxwordlen)
    relwordsnotinworddic = allrelwords.difference(set(worddic.keys()))
    charsnotinchardic = allrelchars.difference(set(chardic.keys()))
    for rwniw in relwordsnotinworddic:
        worddic[rwniw] = len(worddic)
    nextid = 0
    for cnic in charsnotinchardic:
        while nextid in chardic.values():
            nextid += 1
        chardic[cnic] = nextid

    wordmat = np.zeros((len(rels), maxlen)).astype("int32") - 1
    charten = np.zeros((len(rels), maxlen, maxchar)).astype("int32") - 1
    for rel, c in rels:
        wordmat[c, :len(rel)] = map(lambda x: worddic[x], rel)
        j = 0
        for relw in rel:
            charten[c, j, :min(len(relw), maxchar)] = \
                map(lambda x: chardic[x], relw[:min(len(relw), maxchar)])
            j += 1
    datamat = np.concatenate([wordmat.reshape(wordmat.shape + (1, )), charten],
                             axis=2)
    return datamat
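
# getmemdata relies on reldic assigning consecutive ids 0..N-1 (the assert on
# c - 1 == prevc enforces it), so row r of the returned tensor describes
# relation id r. A sketch of the id-assignment pattern that keeps that
# invariant, as the other examples here do it:
reldic = {}
for r in ["/people/person/place_of_birth", "/film/film/director"]:
    if r not in reldic:
        reldic[r] = len(reldic)         # dense ids: 0, 1, 2, ...
rels = sorted(reldic.items(), key=lambda kv: kv[1])
print(rels)                             # sorted by id == insertion order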
Example #9
 def getdata(p, maxc=np.infty):
     data = []
     gold = []
     maxlen = 0
     c = 0
     for line in open(p):
         q, a = (line[:-1] if line[-1] == "\n" else line).split("\t")
         s, p = a.split()
         words = tokenize(q)
         updatedics(*words)
         maxlen = max(maxlen, len(words))
         if s not in entdic:
             updateentk(s)
         if p not in reldic:
             raise Exception("impossibru!")
         wordidx = map(lambda x: worddic[x] if x in worddic else worddic["<RARE>"], words)
         charsidx = [[chardic[x] if x in chardic else chardic[" "]
                     for x in word] for word in words]
         data.append((wordidx, charsidx))
         gold.append([entdic[s], reldic[p]])
         c += 1
         if c % 100 == 0:
             print c
         if c > maxc:
             break
     datamat = np.zeros((c, maxlen, maxnamewordlen+1)).astype("int32") - 1
     goldmat = np.zeros((c, 2)).astype("int32")
     i = 0
     for x, y in data:
         datamat[i, :len(x), 0] = x
         j = 0
         for ye in y:
             # reconstructed continuation (the published snippet is cut off
             # here): write the word's char ids into slots 1..maxnamewordlen
             xec = ye[:maxnamewordlen]
             datamat[i, j, 1:len(xec) + 1] = xec
             j += 1
         i += 1
     i = 0
     for x in gold:
         goldmat[i, :] = x
         i += 1
     return datamat, goldmat
Example #10
def getcharmemdata(reldic, chardic):
    rels = sorted(reldic.items(), key=lambda (x, y): y)
    rels = map(lambda (x, y): (" ".join(tokenize(x)), y), rels)
    maxlen = 0
    prevc = -1
    allrelchars = set()
    for rel, c in rels:
        assert(c-1 == prevc)
        prevc = c
        maxlen = max(maxlen, len(rel))
        for relchar in rel:
            allrelchars.add(relchar)
    charsnotinchardic = allrelchars.difference(set(chardic.keys()))
    nextid = 0
    for cnic in charsnotinchardic:
        while nextid in chardic.values():
            nextid += 1
        chardic[cnic] = nextid
    retmat = np.zeros((len(rels), maxlen)).astype("int32") - 1
    for rel, k in rels:
        rel = [chardic[c] for c in rel]
        retmat[k, :len(rel)] = rel
    return retmat
Example #11
 def readdataset(p, wdic, maxlen=100):
     dataret = []
     goldret = []
     toolong = 0
     realmaxlen = 0
     with open(p) as f:
         data = csv.reader(f, delimiter=",")
         for row in data:
             rowelems = tokenize(row[2])
             realmaxlen = max(realmaxlen, len(rowelems))
             if len(rowelems) > maxlen:
                 toolong += 1
             for rowelem in set(rowelems):
                 if rowelem not in wdic:
                     wdic[rowelem] = len(wdic)
             dataret.append([wdic[x] for x in rowelems])
             goldret.append(row[0])
     print "{} comments were too long".format(toolong)
     maxlen = min(maxlen, realmaxlen)
     datamat = np.ones((len(dataret) - 1, maxlen)).astype("int32") * masksym
     for i in range(1, len(dataret)):
         datamat[i - 1, :min(len(dataret[i]), maxlen)] = dataret[i][:min(len(dataret[i]), maxlen)]
     return datamat, np.asarray(goldret[1:], dtype="int32"), wdic
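
# A toy input in the shape readdataset() expects -- column 0 is the gold label,
# column 2 the text, and the first row is a header (hence dataret[1:] above).
# masksym and tokenize are assumed to come from the enclosing module.
import csv

rows = [["label", "id", "comment_text"],
        ["0", "a1", "this is fine"],
        ["1", "a2", "this is not fine at all"]]
with open("toy.csv", "w") as f:
    csv.writer(f).writerows(rows)
# datamat, gold, wdic = readdataset("toy.csv", {})
# datamat.shape -> (2, maxlen); gold -> array([0, 1], dtype=int32)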
Example #12
def run(trainp="../fb_train.tsv",
        testp="../fb_test.tsv",
        validp="../fb_valid.tsv",
        outp="datamat.word.fb2m.pkl",
        entnames="../subjnames_fb2m.map",
        rellist="../predicates_fb2m.list",
        maxnamelen=30):
    # worddic
    worddic = {"<RARE>": 0}
    wordcounts = {"<RARE>": 0}
    def addwords(*words):       # adds a word to the worddic
        for word in words:
            if word not in worddic:
                worddic[word] = len(worddic)
            if word not in wordcounts:
                wordcounts[word] = 0
            wordcounts[word] += 1

    # entity names
    entdic = {}
    entmatr = []
    c = 0       # counter
    maxlen = 0
    for line in open(entnames):
        entity_id, entity_label = line[:-1].split("\t")
        entity_label_tokens = tokenize(entity_label)
        entity_label_tokens = entity_label_tokens[:min(len(entity_label_tokens), maxnamelen)]    # ensure taken entity label has no more than maximum allowed number of tokens
        maxlen = max(maxlen, len(entity_label_tokens))
        addwords(*entity_label_tokens)
        entmatr.append(entity_label_tokens)
        if entity_id not in entdic:
            entdic[entity_id] = len(entdic)
        if c % 1e3 == 0:
            print("{}k".format(c/1e3))
        c += 1

    entswonames = set()
    def add_entity_wo_label(entid):
        assert(entid not in entdic)
        entdic[entid] = len(entdic)
        entswonames.add(entid)
        entmatr.append(["<RARE>"])

    # relation uri's
    reldic = {}
    relmatr = []
    for line in open(rellist):
        relation_uri = line[:-1]
        relation_uri_tokens = reluri_tokenize(relation_uri)
        relation_uri_tokens = relation_uri_tokens[:min(len(relation_uri_tokens), maxnamelen)]   # ensure max len
        maxlen = max(maxlen, len(relation_uri_tokens))
        addwords(*relation_uri_tokens)
        relmatr.append(relation_uri_tokens)
        if relation_uri not in reldic:
            reldic[relation_uri] = len(reldic)

    maxnamelen = min(maxlen, maxnamelen)

    print len(entdic), len(reldic), len(worddic), maxnamelen

    def getdata(p, maxc=np.infty):
        data = []
        gold = []
        maxlen = 0
        c = 0
        for line in open(p):
            question, answer = line[:-1].split("\t")
            subject, predicate = answer.split(" ")
            question_words = tokenize(question)
            addwords(*question_words)
            maxlen = max(maxlen, len(question_words))
            if subject not in entdic:
                add_entity_wo_label(subject)
            if predicate not in reldic:
                raise Exception("predicate should be there")
            wordidx = [worddic[x] if x in worddic else worddic["<RARE>"] for x in question_words]
            data.append(wordidx)
            gold.append([entdic[subject], reldic[predicate]])
            c += 1
            if c % 100 == 0:
                print c
            if c > maxc:
                break
        datamat = np.zeros((c, maxlen)).astype("int32") - 1
        goldmat = np.zeros((c, 2)).astype("int32")
        i = 0
        for x in data:
            datamat[i, :len(x)] = x
            i += 1
        i = 0
        for x in gold:
            goldmat[i, :] = x
            i += 1
        return datamat, goldmat

    traindata = getdata(trainp)
    validdata = getdata(validp)
    testdata = getdata(testp)

    # build ent mat
    entmat = np.zeros((len(entmatr), maxnamelen), dtype="int32") - 1
    for i in range(len(entmatr)):
        x = entmatr[i]
        entmat[i, :len(x)] = [worddic[a] for a in x]
    # build rel mat
    relmat = np.zeros((len(relmatr), maxnamelen), dtype="int32") - 1
    for i in range(len(relmatr)):
        x = relmatr[i]
        relmat[i, :len(x)] = [worddic[a] for a in x]

    # package
    entmat = np.concatenate([entmat, relmat], axis=0)
    numents = len(entdic)
    traindata[1][:, 1] += numents
    validdata[1][:, 1] += numents
    testdata[1][:, 1] += numents
    entdic.update({k: v+numents for k, v in reldic.items()})

    # save
    acc = {
        "train": traindata,
        "valid": validdata,
        "test": testdata,
        "worddic": worddic,
        "entdic": entdic,
        "entmat": entmat,
        "numents": numents,
        "wordcounts": wordcounts,
    }

    print("{} entities without names in datasets".format(len(entswonames)))

    pickle.dump(acc, open(outp, "w"))
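
# Sketch of consuming the pickle written by run() above (field names taken
# from the acc dict; the file must have been produced first):
import pickle

x = pickle.load(open("datamat.word.fb2m.pkl"))
traindata, traingold = x["train"]   # word-id matrix / [entity, relation] ids
numents = x["numents"]
entmat = x["entmat"]                # row i = label word ids of entity/relation i
# relation ids were shifted past the entities, so one matrix serves both:
first_rel_label_ids = entmat[numents]
print(traindata.shape)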
Example #13
class SimpleQuestionsLabelIndex(object):
    def __init__(self, host="drogon", index="simplequestions_labels"):
        self.host = host
        self.indexp = index

    def index(self, labelp="labels.map"):
        es = elasticsearch.Elasticsearch(hosts=[self.host])
        try:
            es.indices.delete(index=self.indexp)
        except Exception, e:
            print "could not delete index %s" % self.indexp
        es.indices.create(
            index=self.indexp,
            body={
                "settings": {
                    "index": {
                        "analysis": {
                            "filter": {
                                "pstemmer": {
                                    "type": "porter_stem",
                                    "language": "_english_"
                                }
                            },
                            "char_filter": {
                                "punctfil": {
                                    "type": "pattern_replace",
                                    "pattern": "[\W]",
                                    "replacement": ""
                                }
                            },
                            "analyzer": {
                                "myana": {
                                    "tokenizer": "whitespace",
                                    "filter": [
                                        "lowercase",
                                        "pstemmer",
                                    ]
                                    #,"char_filter": ["punctfil"]
                                }
                            }
                        }
                    }
                },
                "mappings": {
                    "labelmap": {
                        "properties": {
                            "label": {
                                "type": "string",
                                "analyzer": "myana",
                                "fields": {
                                    "len": {
                                        "type": "token_count",
                                        "store": "yes",
                                        "analyzer": "myana"
                                    }
                                }
                            }
                        }
                    }
                }
            })
        i = 1
        for line in open(labelp):
            k, v = line[:-1].split("\t")
            vt = tokenize(v)
            es.index(index=self.indexp,
                     doc_type="labelmap",
                     id=i,
                     body={
                         "label": " ".join(vt),
                         "fbid": k
                     })
            if i % 1000 == 0:
                print i
            i += 1
        print "indexed labels"
Example #14
 def searchsentence(self, s, top=None, topsize=None, exact=True):
     s = tokenize(s)
     ngrams = self.getallngrams(s, topsize)
     return self.searchallngrams(ngrams, top, exact=exact)
Example #15
 def __call__(self, x):
     ret = tokenize(self.labelfy(x[0]))
     ret = ret[:min(self.maxlen, len(ret))]
     return ret, x[1]
Example #16
def run(trainp="fb_train.tsv",
        testp="fb_test.tsv",
        validp="fb_valid.tsv",
        outp="datamat.char.mem.fb2m.pkl",
        entnames="subjnames_fb2m.map",
        rellist="rels_fb2m.list",
        maxnamelen=100):
    # worddic
    chardic = {" ": 0}
    def updatechardic(*chars):
        for char in chars:
            if char not in chardic:
                chardic[char] = len(chardic)

    # process entity names and relation list
    entdic = {}
    entmatr = []
    entswonames = set()
    c = 0
    maxlen = 0
    for line in open(entnames):
        e, n = line.split("\t")
        nt = " ".join(tokenize(n))
        nt = nt[:min(len(nt), maxnamelen)]
        maxlen = max(maxlen, len(nt))
        updatechardic(*set(nt))
        entmatr.append(nt)
        if e not in entdic:
            entdic[e] = len(entdic)
        if c % 1e3 == 0:
            print "%.0fk" % (c/1e3)
        c += 1

    def updateentk(*ents):  #ents have not been seen during initial population ==> no titles
        for ent in ents:
            assert(ent not in entdic)
            entdic[ent] = len(entdic)
            entswonames.add(ent)
            entmatr.append([" "])

    reldic = {}
    relmatr = []
    for line in open(rellist):
        r = line[:-1]
        rt = " ".join(tokenize(r))
        rt = rt[:min(len(rt), maxnamelen)]
        maxlen = max(maxlen, len(rt))
        updatechardic(*set(rt))
        relmatr.append(rt)
        r = "/" + r.replace(".", "/")
        if r not in reldic:
            reldic[r] = len(reldic)

    maxnamelen = min(maxlen, maxnamelen)

    print len(entdic), len(reldic), len(chardic), maxnamelen

    # process data
    def getdata(p, maxc=np.infty):
        data = []
        gold = []
        maxlen = 0
        c = 0
        for line in open(p):
            q, a = (line[:-1] if line[-1] == "\n" else line).split("\t")
            s, p = a.split()
            chars = " ".join(tokenize(q))
            updatechardic(*set(chars))
            maxlen = max(maxlen, len(chars))
            if s not in entdic:
                updateentk(s)
            if p not in reldic:
                raise Exception("impossibru!")
            charidx = map(lambda x: chardic[x] if x in chardic else chardic[" "], chars)
            data.append(charidx)
            gold.append([entdic[s], reldic[p]])
            c += 1
            if c % 100 == 0:
                print c
            if c > maxc:
                break
        datamat = np.zeros((c, maxlen)).astype("int32") - 1
        goldmat = np.zeros((c, 2)).astype("int32")
        i = 0
        for x in data:
            datamat[i, :len(x)] = x
            i += 1
        i = 0
        for x in gold:
            goldmat[i, :] = x
            i += 1
        return datamat, goldmat

    traindata = getdata(trainp)
    validdata = getdata(validp)
    testdata = getdata(testp)

    # build ent mat
    entmat = np.zeros((len(entmatr), maxnamelen), dtype="int32") - 1
    for i in range(len(entmatr)):
        x = entmatr[i]
        entmat[i, :len(x)] = map(lambda a: chardic[a], x)
    # build rel mat
    relmat = np.zeros((len(relmatr), maxnamelen), dtype="int32") - 1
    for i in range(len(relmatr)):
        x = relmatr[i]
        relmat[i, :len(x)] = map(lambda a: chardic[a], x)

    # pre-package tests:
    print entmat.shape[0], len(entdic)
    assert(entmat.shape[0] == len(entdic))
    # package
    entmat = np.concatenate([entmat, relmat], axis=0)
    numents = len(entdic)
    traindata[1][:, 1] += numents
    validdata[1][:, 1] += numents
    testdata[1][:, 1] += numents
    reldic = {k: v+numents for k, v in reldic.items()}
    entdic.update(reldic)

    # save
    acc = {
        "train": traindata,
        "valid": validdata,
        "test":  testdata,
        "chardic": chardic,
        "entdic": entdic,
        "entmat": entmat,
        "numents": numents
    }

    print "%d entities without names in datasets" % len(entswonames)

    pickle.dump(acc, open(outp, "w"))
Example #17
    def predict(self,
                data,
                entcans=None,
                relsperent=None,
                relcans=None,
                multiprune=-1):
        print multiprune
        assert (relsperent is None or relcans is None)
        assert (relsperent is not None or relcans is not None)
        assert (entcans is not None)
        self.encodequestions(data)
        rankedsubjs = self.ranksubjects(entcans)
        bestsubjs = [x[0][0] for x in rankedsubjs]
        if relcans is not None:
            rankedrels = self.rankrelations(relcans)
            bestrels = [x[0][0] for x in rankedrels]
        else:
            if multiprune <= 0:
                relcans = [
                    relsperent[bestsubj][0] if bestsubj in relsperent else []
                    for bestsubj in bestsubjs
                ]
                rankedrels = self.rankrelations(relcans)
                bestrels = [x[0][0] for x in rankedrels]
            else:
                print "multipruning !!!!!!!!!!!!!!!!!"
                topk = multiprune  # TOP K !!!!!!!!!!!!!!!!!!!!!!!!!!!!
                # get relcans
                relcans = []
                for subjranking in rankedsubjs:
                    toplabel = None
                    relcanse = []
                    i = 0
                    for subj, score in subjranking:
                        subjlabel = " ".join(
                            tokenize(self.subjinfo[subj][0]) if subj in
                            self.subjinfo else [])
                        topcan = None
                        if toplabel is None:
                            toplabel = subjlabel
                            topcan = subj
                        elif subjlabel == toplabel:
                            topcan = subj
                        elif i < topk:
                            topcan = subj
                        else:
                            pass
                        toadd = relsperent[topcan][0] if topcan in relsperent else []
                        relcanse.extend(toadd)
                        i += 1
                    relcans.append(relcanse)
                # rank relations
                rankedrels = self.rankrelations(relcans)
                bestrels = [x[0][0] for x in rankedrels]
                # build ents per relation
                entsperrel = {}
                for ent, rels in relsperent.items():
                    for rel in rels[0]:
                        if rel not in entsperrel:
                            entsperrel[rel] = set()
                        entsperrel[rel].add(ent)
                # filter rankedsubjs
                filteredrankedsubjs = []
                for i in range(len(rankedsubjs)):
                    filteredrankedsubjs.append([])
                    for subj, score in rankedsubjs[i]:
                        if bestrels[i] in entsperrel and \
                                        subj in entsperrel[bestrels[i]]:
                            filteredrankedsubjs[i].append((subj, score))
                    if len(filteredrankedsubjs[i]) == 0:
                        filteredrankedsubjs[i].append((-1, -1.))
                bestsubjs = [x[0][0] for x in filteredrankedsubjs]

        ret = np.concatenate([
            np.expand_dims(np.asarray(bestsubjs, dtype="int32"), axis=1),
            np.expand_dims(np.asarray(bestrels, dtype="int32"), axis=1)
        ], axis=1)
        return ret
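
# predict() returns an (N, 2) int32 array pairing each question's best subject
# id with its best relation id; -1 marks a subject pruned away entirely.
# A sketch of consuming it on toy output (ids here are made up):
import numpy as np

predictions = np.array([[4231, 17], [-1, 3]], dtype="int32")
for subj, rel in predictions:
    if subj < 0:
        print("no surviving subject candidate")
    else:
        print("subject id %d, relation id %d" % (subj, rel))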
Example #18
def getwords(s):
    return tokenize(s)
Example #19
def run(trainp="fb_train.tsv",
        testp="fb_test.tsv",
        validp="fb_valid.tsv",
        outp="datamat.word.mem.fb2m.pkl",
        entnames="subjnames_fb2m.map",
        rellist="rels_fb2m.list",
        maxnamelen=30):
    # worddic
    worddic = {"<RARE>": 0}

    def updateworddic(*words):
        for word in words:
            if word not in worddic:
                worddic[word] = len(worddic)

    # process entity names and relation list
    entdic = {}
    entmatr = []
    entswonames = set()
    c = 0
    maxlen = 0
    for line in open(entnames):
        e, n = line.split("\t")
        nt = tokenize(n)
        nt = nt[:min(len(nt), maxnamelen)]
        maxlen = max(maxlen, len(nt))
        updateworddic(*nt)
        entmatr.append(nt)
        if e not in entdic:
            entdic[e] = len(entdic)
        if c % 1e3 == 0:
            print "%.0fk" % (c / 1e3)
        c += 1

    def updateentk(*ents):  # ents have not been seen during initial population ==> no titles
        for ent in ents:
            assert (ent not in entdic)
            entdic[ent] = len(entdic)
            entswonames.add(ent)
            entmatr.append(["<RARE>"])

    reldic = {}
    relmatr = []
    for line in open(rellist):
        r = line[:-1]
        rt = tokenize(r)
        rt = rt[:min(len(rt), maxnamelen)]
        maxlen = max(maxlen, len(rt))
        updateworddic(*rt)
        relmatr.append(rt)
        r = "/" + r.replace(".", "/")
        if r not in reldic:
            reldic[r] = len(reldic)

    maxnamelen = min(maxlen, maxnamelen)

    print len(entdic), len(reldic), len(worddic), maxnamelen

    # process data
    def getdata(p, maxc=np.infty):
        data = []
        gold = []
        maxlen = 0
        c = 0
        for line in open(p):
            q, a = (line[:-1] if line[-1] == "\n" else line).split("\t")
            s, p = a.split()
            words = tokenize(q)
            updateworddic(*words)
            maxlen = max(maxlen, len(words))
            if s not in entdic:
                updateentk(s)
            if p not in reldic:
                raise Exception("impossibru!")
            wordidx = map(lambda x: worddic[x] if x in worddic else worddic["<RARE>"], words)
            data.append(wordidx)
            gold.append([entdic[s], reldic[p]])
            c += 1
            if c % 100 == 0:
                print c
            if c > maxc:
                break
        datamat = np.zeros((c, maxlen)).astype("int32") - 1
        goldmat = np.zeros((c, 2)).astype("int32")
        i = 0
        for x in data:
            datamat[i, :len(x)] = x
            i += 1
        i = 0
        for x in gold:
            goldmat[i, :] = x
            i += 1
        return datamat, goldmat

    traindata = getdata(trainp)
    validdata = getdata(validp)
    testdata = getdata(testp)

    # build ent mat
    entmat = np.zeros((len(entmatr), maxnamelen), dtype="int32") - 1
    for i in range(len(entmatr)):
        x = entmatr[i]
        entmat[i, :len(x)] = map(lambda a: worddic[a], x)
    # build rel mat
    relmat = np.zeros((len(relmatr), maxnamelen), dtype="int32") - 1
    for i in range(len(relmatr)):
        x = relmatr[i]
        relmat[i, :len(x)] = map(lambda a: worddic[a], x)

    # pre-package tests:
    print entmat.shape[0], len(entdic)
    assert (entmat.shape[0] == len(entdic))
    # package
    entmat = np.concatenate([entmat, relmat], axis=0)
    numents = len(entdic)
    traindata[1][:, 1] += numents
    validdata[1][:, 1] += numents
    testdata[1][:, 1] += numents
    reldic = {k: v + numents for k, v in reldic.items()}
    entdic.update(reldic)

    # save
    acc = {
        "train": traindata,
        "valid": validdata,
        "test": testdata,
        "worddic": worddic,
        "entdic": entdic,
        "entmat": entmat,
        "numents": numents
    }

    print "%d entities without names in datasets" % len(entswonames)

    pickle.dump(acc, open(outp, "w"))
Example #20
 def processline(self, x):
     return " ".join(tokenize(x))
Example #21
def reluri_tokenize(reluri):
    return tokenize(reluri.replace("/", " ").replace("_", " "))
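
# Self-contained check with a whitespace stand-in for the project's tokenize()
# (an assumption; tokenize is looked up at call time, so defining it here works):
def tokenize(s):
    return s.lower().split()

print(reluri_tokenize("/people/person/place_of_birth"))
# ['people', 'person', 'place', 'of', 'birth']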