Exemplo n.º 1
 # Read the tab-separated file at path `p` and, for each line, derive a
 # processed name (via self.processor.processline on column 3) plus the
 # fb_id, triple count, type id and type name taken from the other columns.
 # Progress is reported through the `ticktock` helper every 1000 lines.
 # NOTE(review): this excerpt looks truncated -- the four `"key": value`
 # lines below are the inside of a dict literal whose opening statement
 # (presumably an append to self.indexdict[name]) and closing bracket are
 # missing, so the block is not valid Python as shown.
 def build(self, p):
     i = 0
     tt = ticktock("builder")
     for line in open(p):
         # line[:-1] drops the last character of the line (assumed to be "\n")
         sline = line[:-1].split("\t")
         fb_id = sline[0]
         # columns 1 and 2 are two integer counts that are summed
         triplecount = int(sline[1]) + int(sline[2])
         name = self.processor.processline(sline[3])
         type_id = sline[4]
         # the literal "<UNK>" is mapped to None
         type_id = type_id if type_id != "<UNK>" else None
         type_name = " ".join(tokenize(sline[5]))
         # compare against the tokenized form of "<UNK>" because type_name
         # itself has already gone through tokenize()
         type_name = type_name if type_name != " ".join(
             tokenize("<UNK>")) else None
         if name not in self.indexdict:
             self.indexdict[name] = []
             "fb_id": fb_id,
             "triplecount": triplecount,
             "type_id": type_id,
             "type_name": type_name
         i += 1
         if i % 1000 == 0:
             tt.live("{}k".format(i // 1000))
Exemplo n.º 2
# Read "question<TAB>subject predicate" lines from the file at `p` and build
# integer matrices from them.
# Returns (datamat, goldmat): datamat is the word-id matrix concatenated with
# the per-word character-id tensor along axis 2; goldmat has one
# [entdic[subject], reldic[predicate]] row per line read.
# Side effects: worddic, entdic, reldic and chardic are extended in place.
# NOTE(review): the excerpt is truncated -- `if c > maxc:` has no body and no
# statement ever appends to `data`, so the sentence loop below never runs as
# written. Python 2 syntax (print statements, tuple-unpacking lambdas).
def getdata(p, worddic, chardic, entdic, reldic, maxc=np.infty, maxchar=30):
    data = []
    gold = []
    maxlen = 0
    maxwordlen = 0
    c = 0
    for line in open(p):
        # strip a trailing newline only if there is one
        q, a = (line[:-1] if line[-1] == "\n" else line).split("\t")
        # note: rebinds `p` (the path parameter) to the predicate string
        s, p = a.split()
        words = tokenize(q)
        maxlen = max(maxlen, len(words))
        for word in words:
            maxwordlen = max(maxwordlen, len(word))
            # unseen words get the next free index
            if word not in worddic:
                worddic[word] = len(worddic)
        if s not in entdic:
            entdic[s] = len(entdic)
        if p not in reldic:
            reldic[p] = len(reldic)
        gold.append([entdic[s], reldic[p]])
        c += 1
        if c > maxc:
    print maxwordlen
    # never allocate more character slots than the longest word needs
    maxchar = min(maxchar, maxwordlen)
    # every cell starts at -1; cells not overwritten below keep that value
    wordmat = np.zeros((c, maxlen)).astype("int32") - 1
    charten = np.zeros((c, maxlen, maxchar)).astype("int32") - 1
    goldmat = np.zeros((c, 2)).astype("int32")
    i = 0
    for sent in data:
        j = 0
        for word in sent:
            if len(word) > maxchar:
                print word
            wordmat[i, j] = worddic[word]
            # store code points first; they are remapped through chardic below
            chars = map(ord, word)
            charten[i, j, :min(len(chars), maxchar)] = chars[:min(len(chars), maxchar)]
            j += 1
        i += 1
    i = 0
    for x in gold:
        goldmat[i, :] = x
        i += 1
    # making chardic and transforming through chardic
    # thischardic is chardic re-keyed by code point instead of character
    thischardic = dict(map(lambda (x,y): (ord(x), y), chardic.items()))
    # nextid = smallest non-negative integer not already used as an id
    nextid = 0
    while nextid in thischardic.values():
        nextid += 1
    uniquechars = np.unique(charten)
    for uniquechar in list(uniquechars):
        # negative entries are the -1 fill value, not characters
        if not uniquechar in thischardic and uniquechar >= 0:
            thischardic[uniquechar] = nextid
            while nextid in thischardic.values():
                nextid += 1
    chardic.update(dict(map(lambda (x, y): (chr(x), y), thischardic.items())))
    print len(chardic), chardic
    # replace code points by their chardic ids, leaving negative cells as-is
    charten = np.vectorize(lambda x: thischardic[x] if x >= 0 else x)(charten)
    datamat = np.concatenate([wordmat.reshape(wordmat.shape + (1,)), charten], axis=2)
    return datamat, goldmat
Exemplo n.º 3
 # Read "question<TAB>subject predicate" lines from the file at `p` and return
 # (datamat, goldmat): a (c, maxlen) int32 matrix initialised to -1 and filled
 # from `data`, and a (c, 2) int32 matrix of [entity id, relation id] rows.
 # `entdic`, `reldic` and `worddic` are not parameters; they come from an
 # enclosing scope that is not visible here.
 # NOTE(review): the excerpt is truncated -- `if subject not in entdic:` and
 # `if c > maxc:` have no body, and `wordidx` is computed but never appended
 # to `data`. Python 2 print statement.
 def getdata(p, maxc=np.infty):
     data = []
     gold = []
     maxlen = 0
     c = 0
     for line in open(p):
         # line[:-1] drops the last character (assumed to be the newline)
         question, answer = line[:-1].split("\t")
         subject, predicate = answer.split(" ")
         question_words = tokenize(question)
         maxlen = max(maxlen, len(question_words))
         if subject not in entdic:
         if predicate not in reldic:
             raise Exception("predicate should be there")
         # words missing from worddic fall back to the "<RARE>" index
         wordidx = [worddic[x] if x in worddic else worddic["<RARE>"] for x in question_words]
         gold.append([entdic[subject], reldic[predicate]])
         c += 1
         if c % 100 == 0:
             print c
         if c > maxc:
     # every cell starts at -1; positions past a row's length keep that value
     datamat = np.zeros((c, maxlen)).astype("int32") - 1
     goldmat = np.zeros((c, 2)).astype("int32")
     i = 0
     for x in data:
         datamat[i, :len(x)] = x
         i += 1
     i = 0
     for x in gold:
         goldmat[i, :] = x
         i += 1
     return datamat, goldmat
Exemplo n.º 4
 # Character-level variant: read "question<TAB>subject predicate" lines from
 # the file at `p` and return (datamat, goldmat), where datamat is a
 # (c, maxlen) int32 matrix initialised to -1 and goldmat holds
 # [entdic[s], reldic[p]] per line. maxlen counts characters of the
 # space-joined tokenized question.
 # `entdic`, `reldic` and `chardic` come from an enclosing scope not shown.
 # NOTE(review): the excerpt is truncated -- `if s not in entdic:` and
 # `if c > maxc:` have no body, and `charidx` is never appended to `data`.
 # Python 2 print statement.
 def getdata(p, maxc=np.infty):
     data = []
     gold = []
     maxlen = 0
     c = 0
     for line in open(p):
         # strip a trailing newline only if there is one
         q, a = (line[:-1] if line[-1] == "\n" else line).split("\t")
         # note: rebinds `p` (the path parameter) to the predicate string
         s, p = a.split()
         chars = " ".join(tokenize(q))
         maxlen = max(maxlen, len(chars))
         if s not in entdic:
         if p not in reldic:
             raise Exception("impossibru!")
         # characters missing from chardic fall back to the id of " "
         charidx = map(lambda x: chardic[x] if x in chardic else chardic[" "], chars)
         gold.append([entdic[s], reldic[p]])
         c += 1
         if c % 100 == 0:
             print c
         if c > maxc:
     datamat = np.zeros((c, maxlen)).astype("int32") - 1
     goldmat = np.zeros((c, 2)).astype("int32")
     i = 0
     for x in data:
         datamat[i, :len(x)] = x
         i += 1
     i = 0
     for x in gold:
         goldmat[i, :] = x
         i += 1
     return datamat, goldmat
Exemplo n.º 5
 def stemmedprocessline(self, x):
     """Return the stemmed, space-joined tokens of ``x``.

     The substrings "'s", "' s" and "'" are removed (in that order) before
     tokenizing; each token is then passed through ``self.stemmer.stem``.
     """
     for fragment in ("'s", "' s", "'"):
         x = x.replace(fragment, "")
     stemmed = []
     for token in tokenize(x):
         stemmed.append(self.stemmer.stem(token))
     return " ".join(stemmed)
Exemplo n.º 6
# Print statistics about a pickled dataset: entities without labels, how many
# test/validation entities do not occur in training, and how many label words
# are missing from the Glove vocabulary.
# `labelp` is passed to loadlabels(); `datap` is unpickled and indexed with
# the keys "train", "valid", "test", "entdic", "worddic" and "numents".
# NOTE(review): the excerpt is truncated -- the `set(map(lambda ...` call is
# never closed, `if labelw not in glove:` has no body, and the function ends
# right after `wocount` is incremented. Python 2 syntax throughout.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# with trusted data.
def run(labelp="labels.map", datap="datamat.word.pkl"):
    labeldic = loadlabels(labelp)
    glove = Glove(50)
    print "the" in glove
    x = pickle.load(open(datap))
    print len(x["train"][0])
    # get entities without labels
    # sort entdic items by id and keep the first x["numents"] of them
    allentids = sorted(x["entdic"].items(), key=lambda
                       (a, b): b)[:x["numents"]]
    print allentids[:20]
    entidsnotinlabeldic = set(map(lambda (a, b): a,
    print len(entidsnotinlabeldic), list(entidsnotinlabeldic)[:20]
    # inverse lookups: id -> word and id -> entity key
    wd = {v: k for k, v in x["worddic"].items()}
    ed = {v: k for k, v in x["entdic"].items()}
    # column 0 of each split's second array is collected as that split's entities
    alltrainents = set(x["train"][1][:, 0])
    allvalidents = set(x["valid"][1][:, 0])
    alltestents = set(x["test"][1][:, 0])
    print "%d/%d (%.2f%%) test set entities not in training " % (
        len(alltestents.difference(alltrainents)), len(alltestents),
        len(alltestents.difference(alltrainents)) * 100. / len(alltestents))
    print "%d/%d (%.2f%%) validation set entities not in training " % (
        len(allvalidents.difference(alltrainents)), len(allvalidents),
        len(allvalidents.difference(alltrainents)) * 100. / len(allvalidents))
    #print len(allvalidents.difference(alltrainents))

    # gather all words in entity labels
    labelwords = {}
    labelwordsnotinglove = set()
    for label in labeldic.values():
        for labelw in tokenize(label):
            # count occurrences of every label word
            if labelw not in labelwords:
                labelwords[labelw] = 0
            labelwords[labelw] += 1
            if labelw not in glove:

    print "%d unique words in labels" % len(labelwords)
    print "%d words not in glove" % len(labelwordsnotinglove)

    for split in ["train", "test", "valid"]:
        #print "SPLIT %s" % split
        # rebinds `split` from the split name to the split's data
        split = x[split]
        #print len(split[0])
        # counts examples whose entity is in entidsnotinlabeldic
        wocount = 0
        for i in range(len(split[0])):
            #print ed[split[1][i][0]], ed[split[1][i][0]] in entidsnotinlabeldic
            if ed[split[1][i][0]] in entidsnotinlabeldic:
                #print " ".join(map(lambda x: wd[x] if x in wd else "", list(split[0][i]))), ed[split[1][i][0]]
                wocount += 1
Exemplo n.º 7
Exemplo n.º 8
# Character-level loader: read "question<TAB>subject predicate" lines from the
# file at `p` and return (datamat, goldmat). datamat is a (c, maxlen) int32
# matrix initialised to -1, filled from `data` and then remapped through
# chardic; goldmat holds [entdic[s], reldic[p]] per line.
# Side effects: entdic, reldic and chardic are extended in place.
# NOTE(review): the excerpt is truncated -- `if c > maxc:` has no body and
# nothing appends `chars` to `data`, so datamat stays all -1 as written.
# Python 2 syntax (print statement, tuple-unpacking lambdas).
def getdata(p, chardic, entdic, reldic, maxc=np.infty):
    data = []
    gold = []
    maxlen = 0
    c = 0
    for line in open(p):
        # strip a trailing newline only if there is one
        q, a = (line[:-1] if line[-1] == "\n" else line).split("\t")
        # note: rebinds `p` (the path parameter) to the predicate string
        s, p = a.split()
        words = tokenize(q)
        # normalise the question to its tokens joined by single spaces
        q = " ".join(words)
        maxlen = max(maxlen, len(q))
        chars = map(ord, q)
        # 123 == ord("{"); the branch is a no-op (its print is commented out)
        if len(set(chars).intersection({123})) > 0:
            pass #print line, q
        if s not in entdic:
            entdic[s] = len(entdic)
        if p not in reldic:
            reldic[p] = len(reldic)
        gold.append([entdic[s], reldic[p]])
        c += 1
        if c > maxc:
    # every cell starts at -1; positions past a row's length keep that value
    datamat = np.zeros((c, maxlen)).astype("int32") - 1
    goldmat = np.zeros((c, 2)).astype("int32")
    i = 0
    for x in data:
        datamat[i, :len(x)] = x
        i += 1
    i = 0
    for x in gold:
        goldmat[i, :] = x
        i += 1
    # making chardic and transforming through chardic
    # thischardic is chardic re-keyed by code point instead of character
    thischardic = dict(map(lambda (x,y): (ord(x), y), chardic.items()))
    # nextid = smallest non-negative integer not already used as an id
    nextid = 0
    while nextid in thischardic.values():
        nextid += 1
    uniquechars = np.unique(datamat)
    for uniquechar in list(uniquechars):
        # negative entries are the -1 fill value, not characters
        if not uniquechar in thischardic and uniquechar >= 0:
            thischardic[uniquechar] = nextid
            while nextid in thischardic.values():
                nextid += 1
    chardic.update(dict(map(lambda (x, y): (chr(x), y), thischardic.items())))
    print len(chardic), chardic
    # replace code points by their chardic ids, leaving negative cells as-is
    datamat = np.vectorize(lambda x: thischardic[x] if x >= 0 else x)(datamat)
    return datamat, goldmat
Exemplo n.º 9
Exemplo n.º 10
# Build a word-id matrix and a character-id tensor for the relation names in
# `reldic`, one row per relation (row index == relation id), and return them
# concatenated.
# `worddic` and `chardic` are not parameters; they are read and extended from
# an enclosing scope that is not visible here.
# NOTE(review): the excerpt is truncated -- `for relwchar in relw:` has no
# body, nothing populates `allrelwords`/`allrelchars`, and the final
# np.concatenate( call is never closed. Python 2 tuple-unpacking lambdas.
def getmemdata(reldic,
               maxchar=30):  # updates worddic with words found in relation
    # relations ordered by id, each name replaced by its token list
    rels = sorted(reldic.items(), key=lambda (x, y): y)
    rels = map(lambda (x, y): (tokenize(x), y), rels)
    allrelwords = set()
    allrelchars = set()
    maxlen = 0
    maxwordlen = 0
    prevc = -1
    for rel, c in rels:
        # relation ids must be consecutive and start at 0
        assert (c - 1 == prevc)
        prevc = c
        maxlen = max(maxlen, len(rel))
        for relw in rel:
            maxwordlen = max(maxwordlen, len(relw))
            for relwchar in relw:
    maxchar = min(maxchar, maxwordlen)
    relwordsnotinworddic = allrelwords.difference(set(worddic.keys()))
    charsnotinchardic = allrelchars.difference(set(chardic.keys()))
    for rwniw in relwordsnotinworddic:
        worddic[rwniw] = len(worddic)
    nextid = 0
    for cnic in charsnotinchardic:
        # advance to the smallest id not yet used as a chardic value
        while nextid in chardic.values():
            nextid += 1
        chardic[cnic] = nextid

    # every cell starts at -1; cells not overwritten below keep that value
    wordmat = np.zeros((len(rels), maxlen)).astype("int32") - 1
    charten = np.zeros((len(rels), maxlen, maxchar)).astype("int32") - 1
    for rel, c in rels:
        wordmat[c, :len(rel)] = map(lambda x: worddic[x], rel)
        j = 0
        for relw in rel:
            # words longer than maxchar are cut to their first maxchar chars
            charten[c, j, :min(len(relw), maxchar)] = \
                map(lambda x: chardic[x], relw[:min(len(relw), maxchar)])
            j += 1
    datamat = np.concatenate([wordmat.reshape(wordmat.shape + (1, )), charten],
    return datamat
Exemplo n.º 11
Exemplo n.º 12
 # Word+character loader: read "question<TAB>subject predicate" lines from the
 # file at `p` and return (datamat, goldmat). datamat has shape
 # (c, maxlen, maxnamewordlen+1) and is initialised to -1; slot 0 of the last
 # axis receives the word ids. goldmat holds [entdic[s], reldic[p]] per line.
 # `entdic`, `reldic`, `worddic`, `chardic` and `maxnamewordlen` come from an
 # enclosing scope that is not visible here.
 # NOTE(review): the excerpt is truncated -- `if s not in entdic:` and
 # `if c > maxc:` have no body, `xec =` has no right-hand side, and the
 # character fill statement is incomplete. Python 2 print statement.
 def getdata(p, maxc=np.infty):
     data = []
     gold = []
     maxlen = 0
     c = 0
     for line in open(p):
         # strip a trailing newline only if there is one
         q, a = (line[:-1] if line[-1] == "\n" else line).split("\t")
         # note: rebinds `p` (the path parameter) to the predicate string
         s, p = a.split()
         words = tokenize(q)
         maxlen = max(maxlen, len(words))
         if s not in entdic:
         if p not in reldic:
             raise Exception("impossibru!")
         # words missing from worddic fall back to the "<RARE>" index
         wordidx = map(lambda x: worddic[x] if x in worddic else worddic["<RARE>"], words)
         # characters missing from chardic fall back to the id of " "
         charsidx = [[chardic[x] if x in chardic else chardic[" "]
                     for x in word] for word in words]
         data.append((wordidx, charsidx))
         gold.append([entdic[s], reldic[p]])
         c += 1
         if c % 100 == 0:
             print c
         if c > maxc:
     datamat = np.zeros((c, maxlen, maxnamewordlen+1)).astype("int32") - 1
     goldmat = np.zeros((c, 2)).astype("int32")
     i = 0
     for x, y in data:
         datamat[i, :len(x), 0] = x
         j = 0
         for ye in y:
             xec =
             datamat[i, j, 1:maxnamewordlen+1]
         i += 1
     i = 0
     for x in gold:
         goldmat[i, :] = x
         i += 1
     return datamat, goldmat
Exemplo n.º 13
# Build a character-id matrix for the relation names in `reldic`: one row per
# relation (row index == relation id), holding the chardic ids of the
# characters of the space-joined tokenized name; unused cells are -1.
# Side effect: chardic is extended in place with characters it lacks.
# NOTE(review): the excerpt is truncated -- `for relchar in rel:` has no body
# (presumably it collected characters into `allrelchars`), so the block is
# not valid Python as shown. Python 2 tuple-unpacking lambdas.
def getcharmemdata(reldic, chardic):
    # relations ordered by id, each name replaced by its normalised string
    rels = sorted(reldic.items(), key=lambda (x, y): y)
    rels = map(lambda (x, y): (" ".join(tokenize(x)), y), rels)
    maxlen = 0
    prevc = -1
    allrelchars = set()
    for rel, c in rels:
        # relation ids must be consecutive and start at 0
        assert(c-1 == prevc)
        prevc = c
        maxlen = max(maxlen, len(rel))
        for relchar in rel:
    charsnotinchardic = allrelchars.difference(set(chardic.keys()))
    nextid = 0
    for cnic in charsnotinchardic:
        # advance to the smallest id not yet used as a chardic value
        while nextid in chardic.values():
            nextid += 1
        chardic[cnic] = nextid
    retmat = np.zeros((len(rels), maxlen)).astype("int32") - 1
    for rel, k in rels:
        rel = [chardic[c] for c in rel]
        retmat[k, :len(rel)] = rel
    return retmat
Exemplo n.º 14
 def readdataset(p, wdic, maxlen=100):
     """Load a comma-separated file and index the tokens of column 2.

     Every row's third column is tokenized; unseen tokens are added to
     ``wdic`` (mutated in place) with the next free index. The first row is
     indexed as well but left out of the returned matrix.

     Returns ``(datamat, gold, wdic)`` where ``datamat`` is an int32 matrix
     with one row per data row after the first, filled with ``masksym`` and
     truncated to ``min(maxlen, longest row)`` columns, and ``gold`` is an
     int32 array built from ``goldret[1:]``.
     """
     token_ids = []
     goldret = []
     overlong = 0
     longest = 0
     with open(p) as f:
         for record in csv.reader(f, delimiter=","):
             tokens = tokenize(record[2])
             longest = max(longest, len(tokens))
             if len(tokens) > maxlen:
                 overlong += 1
             # iterate the set so ids are assigned once per distinct token
             for token in set(tokens):
                 wdic.setdefault(token, len(wdic))
             token_ids.append([wdic[token] for token in tokens])
     print("{} comments were too long".format(overlong))
     maxlen = min(maxlen, longest)
     datamat = np.ones((len(token_ids) - 1, maxlen)).astype("int32") * masksym
     # row 0 of token_ids is skipped, so matrix row r holds token_ids[r + 1]
     for r, ids in enumerate(token_ids[1:]):
         width = min(len(ids), maxlen)
         datamat[r, :width] = ids[:width]
     return datamat, np.asarray(goldret[1:], dtype="int32"), wdic
Exemplo n.º 15
# Word-level packaging script: builds word/entity/relation dictionaries from
# an entity-name file and a relation list, converts the train/valid/test
# files to index matrices via the nested getdata(), stacks entity and
# relation name matrices, shifts relation ids by the number of entities and
# pickles the result to `outp`.
# NOTE(review): the excerpt is truncated -- the `def run(` signature is cut
# after its first parameter (entnames, rellist, validp, testp, outp and
# maxnamelen are used below but never declared here), several `if` bodies are
# empty, nothing appends to entmatr/relmatr/data, and the `acc` dict is never
# closed. Python 2 print statements.
def run(trainp="../fb_train.tsv",
    # worddic
    worddic = {"<RARE>": 0}
    wordcounts = {"<RARE>": 0}
    def addwords(*words):       # adds a word to the worddic
        for word in words:
            if word not in worddic:
                worddic[word] = len(worddic)
            # also keep a running occurrence count per word
            if word not in wordcounts:
                wordcounts[word] = 0
            wordcounts[word] += 1

    # entity names
    entdic = {}
    entmatr = []
    c = 0       # counter
    maxlen = 0
    for line in open(entnames):
        # line[:-1] drops the last character (assumed to be the newline)
        entity_id, entity_label = line[:-1].split("\t")
        entity_label_tokens = tokenize(entity_label)
        entity_label_tokens = entity_label_tokens[:min(len(entity_label_tokens), maxnamelen)]    # ensure taken entity label has no more than maximum allowed number of tokens
        maxlen = max(maxlen, len(entity_label_tokens))
        if entity_id not in entdic:
            entdic[entity_id] = len(entdic)
        if c % 1e3 == 0:
        c += 1

    entswonames = set()
    def add_entity_wo_label(entid):
        # registers an entity id that was not in entdic yet
        assert(entid not in entdic)
        entdic[entid] = len(entdic)

    # relation uri's
    reldic = {}
    relmatr = []
    for line in open(rellist):
        relation_uri = line[:-1]
        relation_uri_tokens = reluri_tokenize(relation_uri)
        relation_uri_tokens = relation_uri_tokens[:min(len(relation_uri_tokens), maxnamelen)]   # ensure max len
        maxlen = max(maxlen, len(relation_uri_tokens))
        if relation_uri not in reldic:
            reldic[relation_uri] = len(reldic)

    # shrink the name width to the longest name actually seen
    maxnamelen = min(maxlen, maxnamelen)

    print len(entdic), len(reldic), len(worddic), maxnamelen

    # Reads "question<TAB>subject predicate" lines from `p` and returns
    # (datamat, goldmat); see the NOTE above about the missing statements.
    def getdata(p, maxc=np.infty):
        data = []
        gold = []
        maxlen = 0
        c = 0
        for line in open(p):
            question, answer = line[:-1].split("\t")
            subject, predicate = answer.split(" ")
            question_words = tokenize(question)
            maxlen = max(maxlen, len(question_words))
            if subject not in entdic:
            if predicate not in reldic:
                raise Exception("predicate should be there")
            # words missing from worddic fall back to the "<RARE>" index
            wordidx = [worddic[x] if x in worddic else worddic["<RARE>"] for x in question_words]
            gold.append([entdic[subject], reldic[predicate]])
            c += 1
            if c % 100 == 0:
                print c
            if c > maxc:
        # every cell starts at -1; positions past a row's length keep it
        datamat = np.zeros((c, maxlen)).astype("int32") - 1
        goldmat = np.zeros((c, 2)).astype("int32")
        i = 0
        for x in data:
            datamat[i, :len(x)] = x
            i += 1
        i = 0
        for x in gold:
            goldmat[i, :] = x
            i += 1
        return datamat, goldmat

    traindata = getdata(trainp)
    validdata = getdata(validp)
    testdata = getdata(testp)

    # build ent mat
    entmat = np.zeros((len(entmatr), maxnamelen), dtype="int32") - 1
    for i in range(len(entmatr)):
        x = entmatr[i]
        entmat[i, :len(x)] = [worddic[a] for a in x]
    # build rel mat
    relmat = np.zeros((len(relmatr), maxnamelen), dtype="int32") - 1
    for i in range(len(relmatr)):
        x = relmatr[i]
        relmat[i, :len(x)] = [worddic[a] for a in x]

    # package
    # relation rows are stacked below the entity rows, so relation ids in the
    # gold matrices and in entdic are shifted by the number of entities
    entmat = np.concatenate([entmat, relmat], axis=0)
    numents = len(entdic)
    traindata[1][:, 1] += numents
    validdata[1][:, 1] += numents
    testdata[1][:, 1] += numents
    entdic.update({k: v+numents for k, v in reldic.items()})

    # save
    acc = {
        "train": traindata,
        "valid": validdata,
        "test": testdata,
        "worddic": worddic,
        "entdic": entdic,
        "entmat": entmat,
        "numents": numents,
        "wordcounts": wordcounts,

    print("{} entities without names in datasets".format(len(entswonames)))

    pickle.dump(acc, open(outp, "w"))
Exemplo n.º 16
# Wrapper around an Elasticsearch index of labels read from a tab-separated
# "id<TAB>label" file.
# NOTE(review): the body of index() is heavily truncated in this excerpt --
# the `try:` matching the `except`, the index-creation call that the settings
# and mappings fragments belong to, all closing brackets, and the per-line
# indexing call are missing, so the class is not valid Python as shown.
# Python 2 syntax (`except Exception, e`, print statements).
class SimpleQuestionsLabelIndex(object):
    def __init__(self, host="drogon", index="simplequestions_labels"):
        # host: Elasticsearch host name; index: name of the index to use
        self.host = host
        self.indexp = index

    # Reads `labelp` line by line, tokenizes each label and prints progress
    # every 1000 lines.
    def index(self, labelp="labels.map"):
        es = elasticsearch.Elasticsearch(hosts=[self.host])
        except Exception, e:
            print "could not delete index %s" % self.indexp
                "settings": {
                    "index": {
                        "analysis": {
                            "filter": {
                                "pstemmer": {
                                    "type": "porter_stem",
                                    "language": "_english_"
                            "char_filter": {
                                "punctfil": {
                                    "type": "pattern_replace",
                                    "pattern": "[\W]",
                                    "replacement": ""
                            "analyzer": {
                                "myana": {
                                    "tokenizer": "whitespace",
                                    "filter": [
                                    #,"char_filter": ["punctfil"]
                "mappings": {
                    "labelmap": {
                        "properties": {
                            "label": {
                                "type": "string",
                                "analyzer": "myana",
                                "fields": {
                                    "len": {
                                        "type": "token_count",
                                        "store": "yes",
                                        "analyzer": "myana"
        i = 1
        for line in open(labelp):
            # line[:-1] drops the last character (assumed to be the newline)
            k, v = line[:-1].split("\t")
            vt = tokenize(v)
                         "label": " ".join(vt),
                         "fbid": k
            if i % 1000 == 0:
                print i
            i += 1
        print "indexed labels"
Exemplo n.º 17
 def searchsentence(self, s, top=None, topsize=None, exact=True):
     """Tokenize ``s``, expand it into n-grams and search for all of them.

     ``topsize`` is forwarded to ``self.getallngrams``; ``top`` and ``exact``
     are forwarded to ``self.searchallngrams``, whose result is returned.
     """
     tokens = tokenize(s)
     candidate_ngrams = self.getallngrams(tokens, topsize)
     hits = self.searchallngrams(candidate_ngrams, top, exact=exact)
     return hits
Exemplo n.º 18
 def __call__(self, x):
     """Return ``(tokens, x[1])`` for a pair ``x``.

     ``tokens`` is the tokenized result of ``self.labelfy(x[0])``, cut to at
     most ``self.maxlen`` entries.
     """
     tokens = tokenize(self.labelfy(x[0]))
     limit = min(self.maxlen, len(tokens))
     clipped = tokens[:limit]
     return clipped, x[1]
Exemplo n.º 19
Exemplo n.º 20
# Character-level packaging script: builds character/entity/relation
# dictionaries from an entity-name file and a relation list, converts the
# train/valid/test files to index matrices via the nested getdata(), stacks
# entity and relation name matrices, shifts relation ids by the number of
# entities and pickles the result to `outp`.
# NOTE(review): the excerpt is truncated -- the `def run(` signature is cut
# after its first parameter (entnames, rellist, validp, testp, outp and
# maxnamelen are used below but never declared here), several `if` bodies are
# empty, nothing appends names to entmatr/relmatr or rows to `data`, and the
# `acc` dict is never closed. Python 2 print statements.
def run(trainp="fb_train.tsv",
    # worddic
    chardic = {" ": 0}
    def updatechardic(*chars):
        # unseen characters get the next free index
        for char in chars:
            if char not in chardic:
                chardic[char] = len(chardic)

    # process entity names and relation list
    entdic = {}
    entmatr = []
    entswonames = set()
    c = 0
    maxlen = 0
    for line in open(entnames):
        e, n = line.split("\t")
        # names are handled as one string, so lengths below count characters
        nt = " ".join(tokenize(n))
        nt = nt[:min(len(nt), maxnamelen)]
        maxlen = max(maxlen, len(nt))
        if e not in entdic:
            entdic[e] = len(entdic)
        if c % 1e3 == 0:
            print "%.0fk" % (c/1e3)
        c += 1

    def updateentk(*ents):  #ents have not been seen during initial population ==> no titles
        for ent in ents:
            assert(ent not in entdic)
            entdic[ent] = len(entdic)
            # entities without a name get a single space as their name row
            entmatr.append([" "])

    reldic = {}
    relmatr = []
    for line in open(rellist):
        # line[:-1] drops the last character (assumed to be the newline)
        r = line[:-1]
        rt = " ".join(tokenize(r))
        rt = rt[:min(len(rt), maxnamelen)]
        maxlen = max(maxlen, len(rt))
        # rewrite "a.b.c" as "/a/b/c" before using it as the reldic key
        r = "/" + r.replace(".", "/")
        if r not in reldic:
            reldic[r] = len(reldic)

    # shrink the name width to the longest name actually seen
    maxnamelen = min(maxlen, maxnamelen)

    print len(entdic), len(reldic), len(chardic), maxnamelen

    # process data
    # Reads "question<TAB>subject predicate" lines from `p` and returns
    # (datamat, goldmat); see the NOTE above about the missing statements.
    def getdata(p, maxc=np.infty):
        data = []
        gold = []
        maxlen = 0
        c = 0
        for line in open(p):
            # strip a trailing newline only if there is one
            q, a = (line[:-1] if line[-1] == "\n" else line).split("\t")
            # note: rebinds `p` (the path parameter) to the predicate string
            s, p = a.split()
            chars = " ".join(tokenize(q))
            maxlen = max(maxlen, len(chars))
            if s not in entdic:
            if p not in reldic:
                raise Exception("impossibru!")
            # characters missing from chardic fall back to the id of " "
            charidx = map(lambda x: chardic[x] if x in chardic else chardic[" "], chars)
            gold.append([entdic[s], reldic[p]])
            c += 1
            if c % 100 == 0:
                print c
            if c > maxc:
        datamat = np.zeros((c, maxlen)).astype("int32") - 1
        goldmat = np.zeros((c, 2)).astype("int32")
        i = 0
        for x in data:
            datamat[i, :len(x)] = x
            i += 1
        i = 0
        for x in gold:
            goldmat[i, :] = x
            i += 1
        return datamat, goldmat

    traindata = getdata(trainp)
    validdata = getdata(validp)
    testdata = getdata(testp)

    # build ent mat
    entmat = np.zeros((len(entmatr), maxnamelen), dtype="int32") - 1
    for i in range(len(entmatr)):
        x = entmatr[i]
        entmat[i, :len(x)] = map(lambda a: chardic[a], x)
    # build rel mat
    relmat = np.zeros((len(relmatr), maxnamelen), dtype="int32") - 1
    for i in range(len(relmatr)):
        x = relmatr[i]
        relmat[i, :len(x)] = map(lambda a: chardic[a], x)

    # pre-package tests:
    print entmat.shape[0], len(entdic)
    assert(entmat.shape[0] == len(entdic))
    # package
    # relation rows are stacked below the entity rows, so relation ids are
    # shifted by the number of entities
    entmat = np.concatenate([entmat, relmat], axis=0)
    numents = len(entdic)
    traindata[1][:, 1] += numents
    validdata[1][:, 1] += numents
    testdata[1][:, 1] += numents
    reldic = {k: v+numents for k, v in reldic.items()}

    # save
    acc = {
        "train": traindata,
        "valid": validdata,
        "test":  testdata,
        "chardic": chardic,
        "entdic": entdic,
        "entmat": entmat,
        "numents": numents

    print "%d entities without names in datasets" % len(entswonames)

    pickle.dump(acc, open(outp, "w"))
Exemplo n.º 21
    # Rank subject candidates and relation candidates and return an int32
    # array pairing the best subject with the best relation for each example.
    # Exactly one of `relsperent` / `relcans` must be given, and `entcans`
    # must be given (enforced by the asserts below).
    # NOTE(review): the excerpt is truncated -- the signature is cut after
    # `self,` (entcans, relsperent, relcans, multiprune are used but not
    # declared here), the `else:` branches that should separate the
    # relcans / relsperent and multiprune <= 0 / > 0 cases are missing, list
    # literals and the np.concatenate( call are never closed, `toadd` is
    # never stored, and `filteredrankedsubjs[i]` is indexed without any row
    # having been appended. Python 2 print statements.
    def predict(self,
        print multiprune
        assert (relsperent is None or relcans is None)
        assert (relsperent is not None or relcans is not None)
        assert (entcans is not None)
        rankedsubjs = self.ranksubjects(entcans)
        # first element of the first entry of each ranking is the best subject
        bestsubjs = [x[0][0] for x in rankedsubjs]
        if relcans is not None:
            rankedrels = self.rankrelations(relcans)
            bestrels = [x[0][0] for x in rankedrels]
            if multiprune <= 0:
                # relation candidates come from the best subject only
                relcans = [
                    relsperent[bestsubj][0] if bestsubj in relsperent else []
                    for bestsubj in bestsubjs
                rankedrels = self.rankrelations(relcans)
                bestrels = [x[0][0] for x in rankedrels]
                print "multipruning !!!!!!!!!!!!!!!!!"
                topk = multiprune  # TOP K !!!!!!!!!!!!!!!!!!!!!!!!!!!!
                # get relcans
                relcans = []
                for subjranking in rankedsubjs:
                    toplabel = None
                    relcanse = []
                    i = 0
                    for subj, score in subjranking:
                        subjlabel = " ".join(
                            tokenize(self.subjinfo[subj][0]) if subj in
                            self.subjinfo else [])
                        # a subject is kept if it is the first one, shares the
                        # first one's label, or is within the first topk
                        topcan = None
                        if toplabel is None:
                            toplabel = subjlabel
                            topcan = subj
                        elif subjlabel == toplabel:
                            topcan = subj
                        elif i < topk:
                            topcan = subj
                        toadd = relsperent[topcan][
                            0] if topcan in relsperent else []
                        i += 1
                # rank relations
                rankedrels = self.rankrelations(relcans)
                bestrels = [x[0][0] for x in rankedrels]
                # build ents per relation
                entsperrel = {}
                for ent, rels in relsperent.items():
                    for rel in rels[0]:
                        if rel not in entsperrel:
                            entsperrel[rel] = set()
                # filter rankedsubjs
                filteredrankedsubjs = []
                for i in range(len(rankedsubjs)):
                    for subj, score in rankedsubjs[i]:
                        # keep only subjects that have the chosen relation
                        if bestrels[i] in entsperrel and \
                                        subj in entsperrel[bestrels[i]]:
                            filteredrankedsubjs[i].append((subj, score))
                    # (-1, -1.) is the placeholder when no subject survives
                    if len(filteredrankedsubjs[i]) == 0:
                        filteredrankedsubjs[i].append((-1, -1.))
                bestsubjs = [x[0][0] for x in filteredrankedsubjs]

        ret = np.concatenate([
            np.expand_dims(np.asarray(bestsubjs, dtype="int32"), axis=1),
            np.expand_dims(np.asarray(bestrels, dtype="int32"), axis=1)
        return ret
Exemplo n.º 22
Exemplo n.º 23
def getwords(s):
    """Return the result of ``tokenize`` applied to ``s``."""
    words = tokenize(s)
    return words
Exemplo n.º 24
Exemplo n.º 25
# Read "question<TAB>subject predicate" lines from the file at `p` and build
# integer matrices from them.
# Returns (datamat, goldmat): datamat is the word-id matrix concatenated with
# the per-word character-id tensor; goldmat has one
# [entdic[subject], reldic[predicate]] row per line read.
# Side effects: worddic, entdic, reldic and chardic are extended in place.
# NOTE(review): the excerpt is truncated -- `if c > maxc:` has no body,
# nothing ever appends to `data`, and the final np.concatenate( call is never
# closed. Python 2 syntax (print statements, tuple-unpacking lambdas).
def getdata(p, worddic, chardic, entdic, reldic, maxc=np.infty, maxchar=30):
    data = []
    gold = []
    maxlen = 0
    maxwordlen = 0
    c = 0
    for line in open(p):
        # strip a trailing newline only if there is one
        q, a = (line[:-1] if line[-1] == "\n" else line).split("\t")
        # note: rebinds `p` (the path parameter) to the predicate string
        s, p = a.split()
        words = tokenize(q)
        maxlen = max(maxlen, len(words))
        for word in words:
            maxwordlen = max(maxwordlen, len(word))
            # unseen words get the next free index
            if word not in worddic:
                worddic[word] = len(worddic)
        if s not in entdic:
            entdic[s] = len(entdic)
        if p not in reldic:
            reldic[p] = len(reldic)
        gold.append([entdic[s], reldic[p]])
        c += 1
        if c > maxc:
    print maxwordlen
    # never allocate more character slots than the longest word needs
    maxchar = min(maxchar, maxwordlen)
    # every cell starts at -1; cells not overwritten below keep that value
    wordmat = np.zeros((c, maxlen)).astype("int32") - 1
    charten = np.zeros((c, maxlen, maxchar)).astype("int32") - 1
    goldmat = np.zeros((c, 2)).astype("int32")
    i = 0
    for sent in data:
        j = 0
        for word in sent:
            if len(word) > maxchar:
                print word
            wordmat[i, j] = worddic[word]
            # store code points first; they are remapped through chardic below
            chars = map(ord, word)
            charten[i, j, :min(len(chars), maxchar
                               )] = chars[:min(len(chars), maxchar)]
            j += 1
        i += 1
    i = 0
    for x in gold:
        goldmat[i, :] = x
        i += 1
    # making chardic and transforming through chardic
    # thischardic is chardic re-keyed by code point instead of character
    thischardic = dict(map(lambda (x, y): (ord(x), y), chardic.items()))
    # nextid = smallest non-negative integer not already used as an id
    nextid = 0
    while nextid in thischardic.values():
        nextid += 1
    uniquechars = np.unique(charten)
    for uniquechar in list(uniquechars):
        # negative entries are the -1 fill value, not characters
        if not uniquechar in thischardic and uniquechar >= 0:
            thischardic[uniquechar] = nextid
            while nextid in thischardic.values():
                nextid += 1
    chardic.update(dict(map(lambda (x, y): (chr(x), y), thischardic.items())))
    print len(chardic), chardic
    # replace code points by their chardic ids, leaving negative cells as-is
    charten = np.vectorize(lambda x: thischardic[x] if x >= 0 else x)(charten)
    datamat = np.concatenate([wordmat.reshape(wordmat.shape + (1, )), charten],
    return datamat, goldmat
Exemplo n.º 26
# Word-level packaging script: builds word/entity/relation dictionaries from
# an entity-name file and a relation list, converts the train/valid/test
# files to index matrices via the nested getdata(), stacks entity and
# relation name matrices, shifts relation ids by the number of entities and
# pickles the result to `outp`.
# NOTE(review): the excerpt is truncated -- the `def run(` signature is cut
# after its first parameter (entnames, rellist, validp, testp, outp and
# maxnamelen are used below but never declared here), updateentk() has lost
# its parameter list (`ents` is undefined), several `if` bodies are empty,
# nothing appends to entmatr/relmatr/data, and the `acc` dict is never
# closed. Python 2 print statements.
def run(trainp="fb_train.tsv",
    # worddic
    worddic = {"<RARE>": 0}

    def updateworddic(*words):
        # unseen words get the next free index
        for word in words:
            if word not in worddic:
                worddic[word] = len(worddic)

    # process entity names and relation list
    entdic = {}
    entmatr = []
    entswonames = set()
    c = 0
    maxlen = 0
    for line in open(entnames):
        e, n = line.split("\t")
        nt = tokenize(n)
        # keep at most maxnamelen tokens of the name
        nt = nt[:min(len(nt), maxnamelen)]
        maxlen = max(maxlen, len(nt))
        if e not in entdic:
            entdic[e] = len(entdic)
        if c % 1e3 == 0:
            print "%.0fk" % (c / 1e3)
        c += 1

    def updateentk(
    ):  #ents have not been seen during initial population ==> no titles
        for ent in ents:
            assert (ent not in entdic)
            entdic[ent] = len(entdic)

    reldic = {}
    relmatr = []
    for line in open(rellist):
        # line[:-1] drops the last character (assumed to be the newline)
        r = line[:-1]
        rt = tokenize(r)
        rt = rt[:min(len(rt), maxnamelen)]
        maxlen = max(maxlen, len(rt))
        # rewrite "a.b.c" as "/a/b/c" before using it as the reldic key
        r = "/" + r.replace(".", "/")
        if r not in reldic:
            reldic[r] = len(reldic)

    # shrink the name width to the longest name actually seen
    maxnamelen = min(maxlen, maxnamelen)

    print len(entdic), len(reldic), len(worddic), maxnamelen

    # process data
    # Reads "question<TAB>subject predicate" lines from `p` and returns
    # (datamat, goldmat); see the NOTE above about the missing statements.
    def getdata(p, maxc=np.infty):
        data = []
        gold = []
        maxlen = 0
        c = 0
        for line in open(p):
            # strip a trailing newline only if there is one
            q, a = (line[:-1] if line[-1] == "\n" else line).split("\t")
            # note: rebinds `p` (the path parameter) to the predicate string
            s, p = a.split()
            words = tokenize(q)
            maxlen = max(maxlen, len(words))
            if s not in entdic:
            if p not in reldic:
                raise Exception("impossibru!")
            # words missing from worddic fall back to the "<RARE>" index
            wordidx = map(
                lambda x: worddic[x]
                if x in worddic else worddic["<RARE>"], words)
            gold.append([entdic[s], reldic[p]])
            c += 1
            if c % 100 == 0:
                print c
            if c > maxc:
        datamat = np.zeros((c, maxlen)).astype("int32") - 1
        goldmat = np.zeros((c, 2)).astype("int32")
        i = 0
        for x in data:
            datamat[i, :len(x)] = x
            i += 1
        i = 0
        for x in gold:
            goldmat[i, :] = x
            i += 1
        return datamat, goldmat

    traindata = getdata(trainp)
    validdata = getdata(validp)
    testdata = getdata(testp)

    # build ent mat
    entmat = np.zeros((len(entmatr), maxnamelen), dtype="int32") - 1
    for i in range(len(entmatr)):
        x = entmatr[i]
        entmat[i, :len(x)] = map(lambda a: worddic[a], x)
    # build rel mat
    relmat = np.zeros((len(relmatr), maxnamelen), dtype="int32") - 1
    for i in range(len(relmatr)):
        x = relmatr[i]
        relmat[i, :len(x)] = map(lambda a: worddic[a], x)

    # pre-package tests:
    print entmat.shape[0], len(entdic)
    assert (entmat.shape[0] == len(entdic))
    # package
    # relation rows are stacked below the entity rows, so relation ids are
    # shifted by the number of entities
    entmat = np.concatenate([entmat, relmat], axis=0)
    numents = len(entdic)
    traindata[1][:, 1] += numents
    validdata[1][:, 1] += numents
    testdata[1][:, 1] += numents
    reldic = {k: v + numents for k, v in reldic.items()}

    # save
    acc = {
        "train": traindata,
        "valid": validdata,
        "test": testdata,
        "worddic": worddic,
        "entdic": entdic,
        "entmat": entmat,
        "numents": numents

    print "%d entities without names in datasets" % len(entswonames)

    pickle.dump(acc, open(outp, "w"))
Exemplo n.º 27
 def processline(self, x):
     """Return the tokens of ``x`` joined by single spaces."""
     tokens = tokenize(x)
     return " ".join(tokens)
Exemplo n.º 28
def reluri_tokenize(reluri):
    """Tokenize a relation URI after turning "/" and "_" into spaces."""
    spaced = reluri.replace("/", " ")
    spaced = spaced.replace("_", " ")
    return tokenize(spaced)