Example #1
def dense2(fof, outfof):
    fnames = util.getfnames(fof)
    isdir = os.path.isdir(fof)
    random.seed(1234567)

    for fname in fnames:

        outlines = []
        with codecs.open(fname, "r", "utf8") as f:
            lines = f.readlines()

        for line in lines:
            sline = line.split("\t")
            if len(sline) > 5:
                tag = sline[0]
                if tag == "O":
                    # randomly keep roughly 2% of O-tagged lines; write a blank line for the rest
                    if random.random() < 0.02:
                        outlines.append(line)
                    else:
                        outlines.append("\n")
                    
                else:
                    outlines.append(line)
            else:
                outlines.append(line)

        if isdir:
            outfname = outfof + "/" + os.path.basename(fname)
        else:
            outfname = outfof
        # if outlines isn't empty...
        with codecs.open(outfname, "w", "utf8") as out:
            for line in outlines:
                out.write(line)
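
Nearly every example in this collection calls getfnames (or util.getfnames), which is not shown here. A minimal sketch of what it presumably does, assuming the argument names either a single file or a directory of files (the body below is an assumption, not the original helper):

import os

def getfnames(fof):
    # Assumed behaviour: a "file or folder" path is resolved to a list of file paths.
    if os.path.isdir(fof):
        return [os.path.join(fof, f) for f in sorted(os.listdir(fof))]
    return [fof]
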
Example #2
def dense2(fof, outfof):
    fnames = util.getfnames(fof)
    isdir = os.path.isdir(fof)
    random.seed(1234567)

    for fname in fnames:

        outlines = []
        with codecs.open(fname, "r", "utf8") as f:
            lines = f.readlines()

        for line in lines:
            sline = line.split("\t")
            if len(sline) > 5:
                tag = sline[0]
                if tag == "O":
                    # randomly keep roughly 2% of O-tagged lines; write a blank line for the rest
                    if random.random() < 0.02:
                        outlines.append(line)
                    else:
                        outlines.append("\n")

                else:
                    outlines.append(line)
            else:
                outlines.append(line)

        if isdir:
            outfname = outfof + "/" + os.path.basename(fname)
        else:
            outfname = outfof
        # if outlines isn't empty...
        with codecs.open(outfname, "w", "utf8") as out:
            for line in outlines:
                out.write(line)
Example #3
def getstats(fof):
    """
    Given a file or folder (or comma-separated list of folders) of CoNLL files,
    collect tag and token statistics over the corpus.
    """

    if "," in fof:
        fofs = fof.split(",")

        fnames = []
        for f in fofs:
            fnames.extend(getfnames(f))
    else:
        fnames = getfnames(fof)

    toks = 0
    totaltags = 0
    totalstarttags = 0
    tags = defaultdict(int)

    for fname in fnames:
        with open(fname) as f:
            for line in f:
                if len(line.strip()) == 0: continue
                if "DOCSTART" in line: continue
                toks += 1

                tag = line.strip().split()[0]

                if tag != "O":
                    tags[tag] += 1
                    totaltags += 1
                    if tag[0] == "B":
                        totalstarttags += 1

    print("Documents: {0}".format(len(fnames)))
    print("Tokens: {0}".format(toks))
    print("Total nes: {0}".format(totalstarttags))
    print("Total ne tokens: {0}".format(totaltags))
    print("NE percentage: {0}%".format(round(100 * totaltags / float(toks),
                                             2)))
    print("Tag dict:")

    tags = sorted(tags.items())

    for t, v in tags:
        print(t, ":", v)
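
The tag is read from the first whitespace-separated field of each non-empty, non-DOCSTART line, so anything other than O counts as a name token and a B- prefix marks an entity start. A tiny self-contained check of that counting logic on hypothetical in-memory lines:

sample = ["B-PER\t0\t1\tx\tx\tJohn\tx\tx\t0",
          "I-PER\t0\t2\tx\tx\tSmith\tx\tx\t0",
          "O\t0\t3\tx\tx\tran\tx\tx\t0"]
tags = [line.split()[0] for line in sample]
print(sum(1 for t in tags if t != "O"))           # 2 name tokens
print(sum(1 for t in tags if t.startswith("B")))  # 1 entity (one B- start)
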
Example #4
def getstats(fof):
    """
    Given a file or folder (or comma-separated list of folders) of CoNLL files,
    collect tag and token statistics over the corpus.
    """

    if "," in fof:
        fofs = fof.split(",")

        fnames = []
        for f in fofs:
            fnames.extend(getfnames(f))
    else:
        fnames = getfnames(fof)
    
    toks = 0
    totaltags = 0
    totalstarttags = 0
    tags = defaultdict(int)
    
    for fname in fnames:
        with open(fname) as f:
            for line in f:
                if len(line.strip()) == 0: continue
                if "DOCSTART" in line: continue
                toks += 1

                tag = line.strip().split()[0]

                if tag != "O":
                    tags[tag] += 1
                    totaltags += 1
                    if tag[0] == "B":
                        totalstarttags += 1
                
    print("Documents: {0}".format(len(fnames)))
    print("Tokens: {0}".format(toks))
    print("Total nes: {0}".format(totalstarttags))
    print("Total ne tokens: {0}".format(totaltags))
    print("NE percentage: {0}%".format(round(100 * totaltags / float(toks), 2)))
    print("Tag dict:")

    tags = sorted(tags.items())
    
    for t, v in tags:
        print(t, ":", v)
Example #5
def convert(fof, outfof=""):
    """ Given a file or folder of plain-text files, convert each one to a CoNLL file."""
    
    fnames = getfnames(fof)

    for fname in fnames:    
        with codecs.open(fname, "r", "utf8") as f:
            lines = f.readlines()

        fnonly = os.path.basename(fname)

        outname = os.path.join(outfof, fnonly + ".conll")        
        out = codecs.open(outname, "w", "utf8")
        
        i = 1
        for line in lines:
            #if line[0] == "#":
            #    out.write(line)
            #    continue

            # if you come across an empty line, don't write anything.
            if len(line.strip()) == 0:
                continue

            sline = line.split()
            

            for word in sline:                
                if len(word) == 0:
                    continue
                
                while len(word) > 0 and word[0] in punc:
                    out.write("\t".join(["O", "0", str(i), "x", "x", word[0], "x", "x", "0\n"]))
                    word = word[1:]
                    i += 1

                after = []
                while len(word) > 0 and word[-1] in punc:
                    # insert so the order is correct.
                    after.insert(0, word[-1])
                    word = word[:-1]
                    
                if len(word) > 0:
                    # word is now stripped of surrounding punctuation; write it.
                    out.write("\t".join(["O", "0", str(i), "x", "x", word, "x", "x", "0\n"]))

                    for pn in after:
                        i += 1
                        out.write("\t".join(["O", "0", str(i), "x", "x", pn, "x", "x", "0\n"]))

                i += 1
                
            out.write("\n")


        out.close()

        print("Wrote to: ", outname)
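
The rows written above are nine tab-separated columns: the tag, a literal 0, the running token index, four placeholder columns, and a trailing 0; only columns 0 (tag), 2 (index) and 5 (surface form) carry real values here, and the weighting examples further down reuse column 6 as a weight, column 7 as a distance, and column 8 as a frequency. punc is assumed to be a module-level collection of punctuation characters (e.g. set(string.punctuation)). For the word "world!" with the running index at 3, the loop emits two rows, shown here as Python string literals:

"O\t0\t3\tx\tx\tworld\tx\tx\t0"
"O\t0\t4\tx\tx\t!\tx\tx\t0"
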
Example #6
def convert(fof, outfof="", outformat="iob2"):
    """ Given a file or folder of text files, this will convert to conll files.
    Assume that the input is always iob2. 

    """

    fnames = getfnames(fof)

    for fname in fnames:
        with codecs.open(fname, "r", "utf8") as f:
            lines = f.readlines()

        fnonly = os.path.basename(fname)

        outname = os.path.join(outfof, fnonly + ".conll")
        out = codecs.open(outname, "w", "utf8")

        i = 1
        last = ""
        for line in lines:
            line = line.strip()

            if len(line.strip()) == 0:
                out.write(line + "\n")
                last = ""
            else:
                sline = line.split("\t")
                if len(sline) == 1:
                    sline.insert(0, "@@@")

                word, tag = sline
                if outformat == "iob1":
                    prefix = tag[0]
                    label = tag[2:]

                    if tag != "O":
                        if last == label and prefix == "B":
                            prefix = "B"
                        else:
                            prefix = "I"
                        tag = "{}-{}".format(prefix, label)
                    last = label

                #out.write("\t".join([tag.upper(), "0", str(i), "x", "x", word, "x", "x", "0\n"]))
                out.write(" ".join([word, "O", "O", tag.upper() + "\n"]))
                i += 1

        out.close()

        print("Wrote to: ", outname)
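
The IOB2-to-IOB1 rewrite above keeps a B- prefix only when an entity of the same type immediately follows another one; everything else becomes I-. A self-contained re-creation of just that per-tag logic (the function name is illustrative):

def iob2_to_iob1(tags):
    # Mirrors the per-line branch in convert(): downgrade B- to I- unless the
    # previous label had the same type, in which case B- is kept.
    out, last = [], ""
    for tag in tags:
        if tag != "O":
            prefix, label = tag[0], tag[2:]
            prefix = "B" if (last == label and prefix == "B") else "I"
            tag = "{}-{}".format(prefix, label)
            last = label
        else:
            last = ""
        out.append(tag)
    return out

print(iob2_to_iob1(["B-PER", "I-PER", "B-PER", "O", "B-ORG"]))
# ['I-PER', 'I-PER', 'B-PER', 'O', 'I-ORG']
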
Example #7
def func(folder, outfolder, labels=[], words=[]):
    fnames = getfnames(folder)

    isfolder = os.path.isdir(folder)

    labdict = {}
    for lab in labels:
        ls = lab.split(":")
        if len(ls) == 1:
            # a bare label (no ":" target) is mapped to O
            labdict[ls[0]] = "O"
        else:
            labdict[ls[0]] = ls[1]
    
    for fname in fnames:
        with open(fname) as f:
            lines = f.readlines()
        outlines = []
        for line in lines:            
            sline = line.split("\t")
            if len(sline) > 5:
                
                if labels == [] and words == []:
                    sline[0] = "O"
                elif len(labels) > 0 and words == []:
                    if sline[0] in labdict:
                        sline[0] = labdict[sline[0]]
                elif len(labels) == 0 and len(words) > 0:
                    if wordstart(sline[5], words):
                        sline[0] = "O"
                elif len(labels) > 0 and len(words) > 0: 
                    if sline[0] in labdict and wordstart(sline[5], words):
                        sline[0] = "O"
                    
            outlines.append("\t".join(sline))
            
        if isfolder:
            fnonly = os.path.basename(fname)
            outpath = outfolder + "/" + fnonly
        else:
            outpath = outfolder

        with open(outpath, "w") as out:
            for line in outlines:
                out.write(line)
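
Each entry of labels is either "FROM:TO" (relabel FROM as TO) or a bare "FROM" (relabel FROM as O, i.e. erase it). A hypothetical call that turns PER names into generic mentions and drops ORG labels (paths and label names are illustrative):

func("data/conll-train", "out/conll-train",
     labels=["B-PER:B-MNT", "I-PER:I-MNT", "B-ORG", "I-ORG"])
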
Example #8
def preparewords(fof, outfile):
    """
    puts all words in one big line. Typically used for
    training a word2vec model

    TODO: consider cleaning punctuation.
    """

    fnames = getfnames(fof)

    print("writing words to", outfile)
    with codecs.open(outfile, "w", "utf-8") as out:
        for fname in fnames:
            with codecs.open(fname, "r", "utf-8") as f:
                lines = f.readlines()
            for line in lines:
                sline = line.split("\t")
                if len(sline) > 5:
                    out.write(sline[5] + " ")
Example #9
def preparesents(fof, outfile):
    """ Reads CoNLL files to create a file to be used for
    language model training. One sentence per line. """

    fnames = getfnames(fof)

    print("writing sentences to", outfile)
    with codecs.open(outfile, "w", "utf-8") as out:
        for fname in fnames:
            with codecs.open(fname, "r", "utf-8") as f:
                lines = f.readlines()

            currsent = ""
            for line in lines:
                sline = line.split("\t")
                if len(sline) > 5:
                    currsent += sline[5] + " "
                else:
                    out.write(currsent.strip() + "\n")
                    currsent = ""
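
Typical usage of the two helpers above, assuming a folder of tab-separated CoNLL files with the token in column 5 (paths are illustrative):

preparewords("data/conll-train", "train.words.txt")   # one long space-separated line of tokens
preparesents("data/conll-train", "train.sents.txt")   # one sentence per line
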
Example #10
def densify(fof, outfof, window=-1):
    """ Given a file, this will densify the file. That is, keep only tokens
    within a window of labels. By default (window=-1), this does nothing."""

    fnames = util.getfnames(fof)
    isdir = os.path.isdir(fof)

    for fname in fnames:

        outlines = set()
        i = 0

        with codecs.open(fname, "r", "utf8") as f:
            lines = f.readlines()

        if window == -1:
            outwrite = lines
            containscontent = True
        else:
            for line in lines:
                sline = line.split("\t")
                if len(sline) > 5:
                    tag = sline[0]
                    if tag != "O":
                        # this is a label.
                        # add w before and w after.
                        # don't even need to worry about range checking!
                        for j in range(i, i - window - 1, -1):
                            if j < 0 or len(lines[j].strip()) == 0:
                                break
                            outlines.add(j)

                        for j in range(i, i + window + 1):
                            if j >= len(lines) or len(lines[j].strip()) == 0:
                                break
                            outlines.add(j)
                else:
                    outlines.add(i)

                i += 1

            # conflate empty lines.
            outwrite = []
            lastlinewasempty = False
            containscontent = False
            for i, line in enumerate(lines):
                if i in outlines:
                    isempty = len(line.strip()) == 0
                    if isempty:
                        if not lastlinewasempty:
                            lastlinewasempty = True
                            outwrite.append(line)
                    else:
                        containscontent = True
                        outwrite.append(line)
                        lastlinewasempty = False

        if isdir:
            outfname = outfof + "/" + os.path.basename(fname)
        else:
            outfname = outfof

        # if outlines isn't empty...
        if containscontent:
            with codecs.open(outfname, "w", "utf8") as out:
                for line in outwrite:
                    out.write(line)
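
To see what the index bookkeeping keeps, consider window=1 on a five-token sentence tagged O O B-PER O O: only the labeled token and one neighbour on each side survive (the real function additionally stops at sentence boundaries and always keeps the separator lines themselves). A minimal re-creation of just the index selection:

tags = ["O", "O", "B-PER", "O", "O"]   # hypothetical tag column
window = 1
keep = set()
for i, tag in enumerate(tags):
    if tag != "O":
        keep.update(range(max(0, i - window), min(len(tags), i + window + 1)))
print(sorted(keep))  # [1, 2, 3]
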
Example #11
def getstats(folders):
    if len(folders) > 2:
        print(">2 folders is not supported. Will only operate on {} and {}".
              format(folders[0], folders[1]))

    # this will only ever have two elements
    namedicts = []
    tokendicts = []
    tagdicts = []

    for folder in folders:
        files = util.getfnames(folder)
        names = defaultdict(int)
        tags = defaultdict(int)
        tokens = defaultdict(int)
        weights = 0
        nametokens = 0
        sentences = 0

        for f in files:
            cdoc = readconll(f)

            sentences += len(cdoc.sentenceends)

            for t in cdoc.tokens:
                if t.weight > 0:
                    tokens[t.s] += t.weight
                weights += t.weight

            for c in cdoc.getconstituents():
                # assume that all tokens in a constituent share the same weight.
                for t in c.tokens:
                    if t.weight > 0:
                        nametokens += t.weight
                        names[c.label + "\t" + c.surf()] += t.weight
                        tags[c.label] += t.weight

        namedicts.append(names)
        tokendicts.append(tokens)
        tagdicts.append(tags)

        numtokens = sum(tokens.values())
        numnames = sum(names.values())
        uniqnames = len(names.keys())

        try:
            reps = numnames / float(uniqnames)
            uniqs = uniqnames / float(numnames)
            tagpercentage = nametokens / numtokens
            weightedtagpercentage = nametokens / weights
        except ZeroDivisionError:
            reps = 0.0
            uniqs = 0.0
            tagpercentage = 0.
            weightedtagpercentage = 0.

        print("{}: {}".format("Folder", folder))
        print(" {:<20}{:>10}".format("Documents", len(files)))
        print(" {:<20}{:>10,}".format("Num tokens", numtokens))
        print(" {:<20}{:>10,}".format("Num unique tokens", len(tokens.keys())))
        print(" {:<20}{:>10,}".format("Num sentences", sentences))
        print(" {:<20}{:>10,}".format("Num names", numnames))
        print(" {:<20}{:>10,}".format("Num name tokens", nametokens))
        print(" {:<20}{:>10,}".format("Num unique name tokens", uniqnames))
        print(" {:<20}{:>10.2}".format("Avg num repetitions", reps))
        print(" {:<20}{:>10.2}".format("Unique / total", uniqs))
        print(" {:<20}{:>10.2%}".format("Tag %", tagpercentage))
        print(" {:<20}{:>10.2%}".format("Weighted Tag %",
                                        weightedtagpercentage))
        print(" Tag dict")
        for t in sorted(tags):
            print("  {}: {} ({:.2%})".format(t, tags[t],
                                             tags[t] / float(numnames)))

    if len(namedicts) == 2:
        print("Comparison:")

        if tagdicts[0].keys() != tagdicts[1].keys():
            print(" ***Mismatching tag set!***")

        namecos, nameinter, nameweight = cos(namedicts[0], namedicts[1])
        vocabcos, vocabinter, vocabweight = cos(tokendicts[0], tokendicts[1])
        vocabkl, inter = KL(tokendicts[0], tokendicts[1])
        tagcos, taginter, tagweight = cos(tagdicts[0], tagdicts[1])

        print(" Names cos sim: {}".format(namecos))
        print(" Vocab cos sim: {}".format(vocabcos))
        print(" Vocab overlap: {}".format(vocabinter))
        print(" Vocab KL divg: {}".format(vocabkl))
        print(" Tag cos sim: {}".format(tagcos))

        numtestnames = sum(namedicts[1].values())
        print(" %names in test seen in training: {}".format(
            nameinter / float(numtestnames)))

        numtesttokens = sum(tokendicts[1].values())
        print(" %tokens in test seen in training: {}".format(
            vocabinter / float(numtesttokens)))
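
cos and KL are not included in this collection. Judging from how the return values are used (a similarity, an intersection mass that gets divided by the test-side totals, and an unused weight), cos is plausibly a cosine similarity over two count dictionaries; the sketch below is an assumption about its shape, not the original definition:

import math

def cos(d1, d2):
    # Assumed helper: cosine similarity of two count dicts, plus the count mass
    # of d2 covered by shared keys, plus that mass as a fraction of d2 (a guess).
    shared = d1.keys() & d2.keys()
    dot = sum(d1[k] * d2[k] for k in shared)
    n1 = math.sqrt(sum(v * v for v in d1.values()))
    n2 = math.sqrt(sum(v * v for v in d2.values()))
    sim = dot / (n1 * n2) if n1 and n2 else 0.0
    inter = sum(d2[k] for k in shared)
    total2 = sum(d2.values())
    frac = inter / total2 if total2 else 0.0
    return sim, inter, frac
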
Example #12
def func(folder, outfolder, mention):
    random.seed(1234567)

    fnames = getfnames(folder)
    isfolder = os.path.isdir(folder)

    # word frequencies.
    wfreq = defaultdict(int)
    weightmass = defaultdict(int)

    # make a first pass to gather word
    # frequencies and distances from entities.
    dists = defaultdict(lambda: defaultdict(int))
    
    # is entity?
    def isent(s):
        return s != "O"

    for fname in fnames:

        # this measures the distance from token i to the nearest
        # named entity

        lastindex = 0

        # whether the previous entity boundary was a sentence break
        prevempty = True

        with open(fname) as f:
            lines = f.readlines()

        for i, line in enumerate(lines):
            sline = line.split("\t")
            if len(sline) > 5:
                wfreq[sline[5]] += 1

            ent = isent(sline[0])
            empty = len(sline) < 5
            if ent or empty:
                dists[fname][i] = 0
                # when you see this, then go over all tokens since then...
                dd = i - lastindex

                startbias = 0
                endbias = 0

                if empty and prevempty:
                    # these are just large numbers...
                    startbias = 10000
                    endbias = 10000
                elif empty:
                    endbias = 10000
                elif prevempty:
                    startbias = 10000

                for j in range(lastindex+1, i):
                    disttolast = j - lastindex
                    disttonext = dd - disttolast
                    dists[fname][j] = min(disttolast + startbias, disttonext + endbias)
                lastindex = i
                prevempty = empty


    # normalize the word frequencies
    mx = max(wfreq.values())
    for w in wfreq:
        wfreq[w] /= mx

    for fname in fnames:
        with open(fname) as f:
            lines = f.readlines()
        outlines = []
        for i, line in enumerate(lines):
            
            sline = line.strip().split("\t")
            if len(sline) > 5:

                sline[7] = str(dists[fname][i])

                if (sline[5] in punc or isnum(sline[5])):
                    theta = 1.0
                    sline[6] = 0.0 if random.random() > theta else 1.0
                elif sline[0] == "O":
                    # probability of including
                    #theta = wfreq[sline[5]]
                    theta = 0.01
                    sline[6] = 0.0 if random.random() > theta else 1.0
                    #sline[6] = 0.0
                    #sline[6] = windowed(dists[fname][i])
                    #sline[6] = 1.0
                    #sline[6] = softwindowed(dists[fname][i])
                    #sline[6] = 
                    #sline[6] = rand()
                else:
                    # probability of including
                    theta = 0.3
                    sline[6] = 0.0 if random.random() > theta else 1.0

                weightmass[sline[0]] += sline[6]
                sline[6] = str(sline[6])
                sline[8] = str(wfreq[sline[5]])
                outlines.append("\t".join(sline) + "\n")
            else:
                outlines.append("\n")

        if isfolder:
            fnonly = os.path.basename(fname)
            outpath = outfolder + "/" + fnonly
        else:
            outpath = outfolder

        with open(outpath, "w") as out:
            for line in outlines:
                out.write(line)

    for tag in sorted(weightmass):
        print("{}: {}".format(tag, weightmass[tag]))
    tags = 0
    for k in weightmass:
        if k == "O":
            continue
        tags += weightmass[k]
    print("Final ratio R: {:.2%}".format(tags / sum(weightmass.values())))
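
The weight column here is a Bernoulli draw: with probability theta (1.0 for punctuation and numbers, 0.01 for O tokens, 0.3 for name tokens) a token gets weight 1.0, otherwise 0.0, so the expected weight mass of a class is roughly theta times its token count. A quick check of that expectation:

import random

random.seed(1234567)
theta = 0.01
draws = [0.0 if random.random() > theta else 1.0 for _ in range(100000)]
print(sum(draws) / len(draws))  # close to theta, i.e. about 0.01
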
Example #13
def func(folder, outfolder, precision, recall):
    # This is v1 seed:
    random.seed(1234567)
    # v2 seed:
    #random.seed(4343)

    fnames = getfnames(folder)

    # this contains the sets of constituents and frequencies
    d = defaultdict(int)

    namesdocs = []
    total = 0
    labels = list()

    for fname in fnames:
        cdoc = readconll(fname)
        cons = cdoc.getconstituents()

        namesdocs.append((fname, cdoc))

        for c in cons:
            d[c.surf()] += 1
            total += 1
            labels.append(c.label)

    labels = list(labels)

    # build the set of names we will keep...
    goal = recall * total
    currnum = 0
    activecons = set()

    # impose ordering
    ditems = sorted(d.items())

    # make it random, but consistent.
    random.shuffle(ditems)

    for c, freq in ditems:
        activecons.add(c)
        currnum += freq
        if currnum >= goal:
            break

    print("Writing to {}".format(outfolder))
    for fname, cdoc in namesdocs:
        cons = cdoc.getconstituents()

        numpos = len(cons)
        for con in cons:
            # discard all the names we don't keep.
            if con.surf() not in activecons:
                # print(con)
                cdoc.removeconstituent(con)
                numpos -= 1

        badspanstoadd = math.ceil(numpos / precision - numpos)
        for _ in range(badspanstoadd):
            start = random.randrange(0, len(cdoc.tokens) - 5)
            length = random.randrange(1, 3)
            end = start + length
            randlabel = random.choice(labels)
            newcon = Constituent(randlabel, cdoc.tokens[start:end],
                                 (start, end))
            cdoc.addconstituent(newcon)

        with open(outfolder + "/" + os.path.basename(fname), "w") as out:
            cdoc.write(out)
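
The number of spurious spans to inject follows directly from the target precision: if numpos true names are kept, then precision = numpos / (numpos + bad) gives bad = numpos / precision - numpos, rounded up. For example, 40 kept names at a target precision of 0.8 require ten random spans:

import math

numpos, precision = 40, 0.8
print(math.ceil(numpos / precision - numpos))  # 10
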
Example #14
def func(folder, outfolder, mention):
    random.seed(1234567)

    fnames = getfnames(folder)
    isfolder = os.path.isdir(folder)

    # word frequencies.
    wfreq = defaultdict(int)
    weightmass = defaultdict(int)

    # make a first pass to gather word
    # frequencies and distances from entities.
    dists = defaultdict(lambda: defaultdict(int))

    # is entity?
    def isent(s):
        return s != "O"

    for fname in fnames:

        # this measures the distance from token i to the nearest
        # named entity

        lastindex = 0

        # whether the previous entity boundary was a sentence break
        prevempty = True

        with open(fname) as f:
            lines = f.readlines()

        for i, line in enumerate(lines):
            sline = line.split("\t")
            if len(sline) > 5:
                wfreq[sline[5]] += 1

            ent = isent(sline[0])
            empty = len(sline) < 5
            if ent or empty:
                dists[fname][i] = 0
                # when you see this, then go over all tokens since then...
                dd = i - lastindex

                startbias = 0
                endbias = 0

                if empty and prevempty:
                    # these are just large numbers...
                    startbias = 10000
                    endbias = 10000
                elif empty:
                    endbias = 10000
                elif prevempty:
                    startbias = 10000

                for j in range(lastindex + 1, i):
                    disttolast = j - lastindex
                    disttonext = dd - disttolast
                    dists[fname][j] = min(disttolast + startbias,
                                          disttonext + endbias)
                lastindex = i
                prevempty = empty

    # normalize the word frequencies
    mx = max(wfreq.values())
    for w in wfreq:
        wfreq[w] /= mx

    for fname in fnames:
        with open(fname) as f:
            lines = f.readlines()
        outlines = []
        for i, line in enumerate(lines):

            sline = line.strip().split("\t")
            if len(sline) > 5:

                sline[7] = str(dists[fname][i])

                if (sline[5] in punc or isnum(sline[5])):
                    theta = 1.0
                    sline[6] = 0.0 if random.random() > theta else 1.0
                elif sline[0] == "O":
                    # probability of including
                    #theta = wfreq[sline[5]]
                    theta = 0.01
                    sline[6] = 0.0 if random.random() > theta else 1.0
                    #sline[6] = 0.0
                    #sline[6] = windowed(dists[fname][i])
                    #sline[6] = 1.0
                    #sline[6] = softwindowed(dists[fname][i])
                    #sline[6] =
                    #sline[6] = rand()
                else:
                    # probability of including
                    theta = 0.3
                    sline[6] = 0.0 if random.random() > theta else 1.0

                weightmass[sline[0]] += sline[6]
                sline[6] = str(sline[6])
                sline[8] = str(wfreq[sline[5]])
                outlines.append("\t".join(sline) + "\n")
            else:
                outlines.append("\n")

        if isfolder:
            fnonly = os.path.basename(fname)
            outpath = outfolder + "/" + fnonly
        else:
            outpath = outfolder

        with open(outpath, "w") as out:
            for line in outlines:
                out.write(line)

    for tag in sorted(weightmass):
        print("{}: {}".format(tag, weightmass[tag]))
    tags = 0
    for k in weightmass:
        if k == "O":
            continue
        tags += weightmass[k]
    print("Final ratio R: {:.2%}".format(tags / sum(weightmass.values())))
Example #15
def densify(fof, outfof, window=-1):
    """ Given a file, this will densify the file. That is, keep only tokens
    within a window of labels. By default (window=-1), this does nothing."""

    fnames = util.getfnames(fof)
    isdir = os.path.isdir(fof)

    for fname in fnames:

        outlines = set()
        i = 0
        
        with codecs.open(fname, "r", "utf8") as f:
            lines = f.readlines()

        if window == -1:
            outwrite = lines
            containscontent = True
        else:
            for line in lines:
                sline = line.split("\t")
                if len(sline) > 5:
                    tag = sline[0]
                    if tag != "O":
                        # this is a label.
                        # add w before and w after.
                        # don't even need to worry about range checking!
                        for j in range(i, i-window-1, -1):                            
                            if j < 0 or len(lines[j].strip()) == 0:
                                break
                            outlines.add(j)


                        for j in range(i, i+window+1):
                            if j >= len(lines) or len(lines[j].strip()) == 0:
                                break
                            outlines.add(j)
                else:
                    outlines.add(i)

                i += 1

            # conflate empty lines.
            outwrite = []
            lastlinewasempty = False
            containscontent = False
            for i, line in enumerate(lines):
                if i in outlines:
                    isempty = len(line.strip()) == 0
                    if isempty:
                        if not lastlinewasempty:
                            lastlinewasempty = True
                            outwrite.append(line)
                    else:
                        containscontent = True
                        outwrite.append(line)
                        lastlinewasempty = False

        if isdir:
            outfname = outfof + "/" + os.path.basename(fname)
        else:
            outfname = outfof
                    
        # if outlines isn't empty...
        if containscontent:
            with codecs.open(outfname, "w", "utf8") as out:
                for line in outwrite:
                    out.write(line)
Example #16
def func(folder, outfolder, mention, methods, defaultweight):
    """ methods is a list. Can contain: punc, window, softwindow, freq, uniform, random """
    
    random.seed(1234567)

    fnames = getfnames(folder)
    isfolder = os.path.isdir(folder)

    # word frequencies.
    wfreq = defaultdict(int)
    # make a first pass to gather word
    # frequencies and distances from entities.
    dists = defaultdict(lambda: defaultdict(int))
    
    # is entity?
    def isent(s):
        return s != "O"

    for fname in fnames:

        # this measures the distance from token i to the nearest
        # named entity

        lastindex = 0

        # whether the previous entity boundary was a sentence break
        prevempty = True

        with open(fname) as f:
            lines = f.readlines()

        for i, line in enumerate(lines):
            sline = line.split("\t")
            if len(sline) > 5:
                wfreq[sline[5]] += 1

            ent = isent(sline[0])
            empty = len(sline) < 5
            if ent or empty:
                dists[fname][i] = 0
                # when you see this, then go over all tokens since then...
                dd = i - lastindex

                startbias = 0
                endbias = 0

                if empty and prevempty:
                    # these are just large numbers...
                    startbias = 10000
                    endbias = 10000
                elif empty:
                    endbias = 10000
                elif prevempty:
                    startbias = 10000

                for j in range(lastindex+1, i):
                    disttolast = j - lastindex
                    disttonext = dd - disttolast
                    dists[fname][j] = min(disttolast + startbias, disttonext + endbias)
                lastindex = i
                prevempty = empty


    # normalize the word frequencies
    mx = max(wfreq.values())
    for w in wfreq:
        wfreq[w] /= mx

    for method in methods:
        if len(method) > 0 and method not in allowedmethods:
            print("Warning: {} not a supported method. Ignoring.".format(method))
        
    for fname in fnames:
        with open(fname) as f:
            lines = f.readlines()
        outlines = []
        for i, line in enumerate(lines):
            
            sline = line.strip().split("\t")
            if len(sline) > 5:

                sline[7] = str(dists[fname][i])

                if mention:
                    # in this case, we don't want weights.
                    sline[6] = 1.0
                    if sline[0] != "O":
                        sline[0] = "B-MNT"
                elif sline[0] == "O":

                    if defaultweight is not None:
                        sline[6] = defaultweight
                    try:
                        sline[6] = float(sline[6])
                    except Exception:
                        sline[6] = 0.0
                    
                    if "random" in methods:
                        if random.random() < 0.25:
                            sline[6] += 1.0
                    
                    # These all assign a weight to every O token.
                    if "softwindow" in methods:
                        sline[6] += softwindow(dists[fname][i])
                        
                    if "freq" in methods:
                        sline[6] += freq(wfreq[sline[5]])
                        
                    if "uniform" in methods:
                        sline[6] += uniform()

                    # The following give weights to just a few.
                    if "punc" in methods:
                        if sline[5] in punc or isnum(sline[5]):
                            sline[6] += 1.0

                    if "window" in methods:
                        if dists[fname][i] <= 1:
                            sline[6] += 1.0
                        
                    if sline[6] == "x":
                        sline[6] = 0.0
                else:
                    sline[6] = 1.0

                sline[6] = str(sline[6])
                sline[8] = str(wfreq[sline[5]])
                outlines.append("\t".join(sline) + "\n")
            else:
                outlines.append("\n")

        if isfolder:
            fnonly = os.path.basename(fname)
            outpath = outfolder + "/" + fnonly
        else:
            outpath = outfolder

        with open(outpath, "w") as out:
            for line in outlines:
                out.write(line)
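
softwindow, freq, uniform, isnum, punc and allowedmethods are module-level names that do not appear in this collection. The sketch below gives one plausible set of definitions purely so the example is self-contained; every body here is an assumption, not the original code:

import math
import string

punc = set(string.punctuation)
allowedmethods = {"punc", "window", "softwindow", "freq", "uniform", "random"}

def softwindow(dist, scale=5.0):
    # Assumed: weight decays smoothly with distance from the nearest entity.
    return math.exp(-dist / scale)

def freq(normfreq):
    # Assumed: rarer tokens (lower normalized frequency) get a larger weight.
    return 1.0 - normfreq

def uniform():
    # Assumed: a small constant weight for every O token.
    return 0.1

def isnum(s):
    # Assumed: True for purely numeric tokens.
    return s.replace(",", "").replace(".", "", 1).isdigit()
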
Example #17
def func(fof1, fof2, ignore=False):
    print("THIS ONLY RETURNS TOKEN LEVEL")
    fnames1 = sorted(getfnames(fof1))
    fnames2 = sorted(getfnames(fof2))

    labels = set()
    gold = []
    pred = []
    for f1, f2 in zip(fnames1, fnames2):
        print(f1, f2)

        try:
            with open(f1) as f:
                lines = f.readlines()

            with open(f2) as f:
                lines2 = f.readlines()
        except IOError as e:
            print(e)
            continue

        i = 0
        j = 0
        total = 0
        while i < len(lines) and j < len(lines2):
            sline = lines[i].split("\t")
            sline2 = lines2[j].split("\t")

            try:
                while "-DOCSTART-" in lines[i] or lines[i].strip() == "":
                    i += 1
                    sline = lines[i].split("\t")

                while "-DOCSTART-" in lines2[j] or lines2[j].strip() == "":
                    j += 1
                    sline2 = lines2[j].split("\t")
            except IndexError:
                break

            if len(sline) < 5:
                # advance both files; a bare continue here would never terminate
                i += 1
                j += 1
                continue

            predweight = sline2[6]
            total += 1
            if ignore and float(predweight) == 0.0:
                pass
            else:
                gold.append(sline[0])
                pred.append(sline2[0])
                if sline[0] != "O":
                    labels.add(sline[0])

            #if sline[5] != sline2[5]:
            #    print("mismatching words!")
            #    print(sline[5])
            #    print(sline2[5])
            #    exit()

            i += 1
            j += 1

    labels = list(labels)
    p = precision_score(gold, pred, labels=labels, average="weighted")
    r = recall_score(gold, pred, labels=list(labels), average="weighted")
    f1 = f1_score(gold, pred, labels=list(labels), average="weighted")
    print("Scoring: {} lines out of {}".format(len(pred), total))
    print("SCORES: {} {} {}".format(p, r, f1))
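
precision_score, recall_score and f1_score match the scikit-learn API (sklearn.metrics), which is presumably where they are imported from. A small usage sketch with toy gold/predicted tag lists:

from sklearn.metrics import f1_score, precision_score, recall_score

gold = ["O", "B-PER", "I-PER", "O", "B-ORG"]
pred = ["O", "B-PER", "O", "O", "B-ORG"]
labels = ["B-PER", "I-PER", "B-ORG"]
print(precision_score(gold, pred, labels=labels, average="weighted"))
print(recall_score(gold, pred, labels=labels, average="weighted"))
print(f1_score(gold, pred, labels=labels, average="weighted"))
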
Example #18
def getstats(folders):
    if len(folders) > 2:
        print(">2 folders is not supported. Will only operate on {} and {}"
              .format(folders[0], folders[1]))

    # this will only ever have two elements
    namedicts = []
    tokendicts = []
    tagdicts = []

    for folder in folders:
        files = util.getfnames(folder)
        names = defaultdict(int)
        tags = defaultdict(int)
        tokens = defaultdict(int)
        weights = 0
        nametokens = 0
        sentences = 0

        for f in files:
            cdoc = readconll(f)

            sentences += len(cdoc.sentenceends)

            for t in cdoc.tokens:
                if t.weight > 0:
                    tokens[t.s] += t.weight
                weights += t.weight

            for c in cdoc.getconstituents():
                # assume that all tokens in a constituent share the same weight.
                for t in c.tokens:
                    if t.weight > 0:
                        nametokens += t.weight
                        names[c.label + "\t" + c.surf()] += t.weight
                        tags[c.label] += t.weight

        namedicts.append(names)
        tokendicts.append(tokens)
        tagdicts.append(tags)

        numtokens = sum(tokens.values())
        numnames = sum(names.values())
        uniqnames = len(names.keys())

        try:
            reps = numnames / float(uniqnames)
            uniqs = uniqnames / float(numnames)
            tagpercentage = nametokens / numtokens
            weightedtagpercentage = nametokens / weights
        except ZeroDivisionError:
            reps = 0.0
            uniqs = 0.0
            tagpercentage = 0.
            weightedtagpercentage = 0.

        print("{}: {}".format("Folder", folder))
        print(" {:<20}{:>10}".format("Documents", len(files)))
        print(" {:<20}{:>10,}".format("Num tokens", numtokens))
        print(" {:<20}{:>10,}".format("Num unique tokens", len(tokens.keys())))
        print(" {:<20}{:>10,}".format("Num sentences", sentences))
        print(" {:<20}{:>10,}".format("Num names", numnames))
        print(" {:<20}{:>10,}".format("Num name tokens", nametokens))
        print(" {:<20}{:>10,}".format("Num unique name tokens", uniqnames))
        print(" {:<20}{:>10.2}".format("Avg num repetitions", reps))
        print(" {:<20}{:>10.2}".format("Unique / total", uniqs))
        print(" {:<20}{:>10.2%}".format("Tag %", tagpercentage))
        print(" {:<20}{:>10.2%}".format("Weighted Tag %", weightedtagpercentage))
        print(" Tag dict")
        for t in sorted(tags):
            print("  {}: {} ({:.2%})"
                  .format(t, tags[t], tags[t] / float(numnames)))

    if len(namedicts) == 2:
        print("Comparison:")

        if tagdicts[0].keys() != tagdicts[1].keys():
            print(" ***Mismatching tag set!***")

        namecos, nameinter, nameweight = cos(namedicts[0], namedicts[1])
        vocabcos, vocabinter, vocabweight = cos(tokendicts[0], tokendicts[1])
        vocabkl, inter = KL(tokendicts[0], tokendicts[1])
        tagcos, taginter, tagweight = cos(tagdicts[0], tagdicts[1])

        print(" Names cos sim: {}".format(namecos))
        print(" Vocab cos sim: {}".format(vocabcos))
        print(" Vocab overlap: {}".format(vocabinter))
        print(" Vocab KL divg: {}".format(vocabkl))
        print(" Tag cos sim: {}".format(tagcos))

        numtestnames = sum(namedicts[1].values())
        print(" %names in test seen in training: {}"
              .format(nameinter / float(numtestnames)))
        
        numtesttokens = sum(tokendicts[1].values())
        print(" %tokens in test seen in training: {}"
              .format(vocabinter / float(numtesttokens)))
Example #19
def func(folder, outfolder, mention, methods, defaultweight):
    """ methods is a list. Can contain: punc, window, softwindow, freq, uniform, random """

    random.seed(1234567)

    fnames = getfnames(folder)
    isfolder = os.path.isdir(folder)

    # word frequencies.
    wfreq = defaultdict(int)
    # make a first pass to gather word
    # frequencies and distances from entities.
    dists = defaultdict(lambda: defaultdict(int))

    # is entity?
    def isent(s):
        return s != "O"

    for fname in fnames:

        # this measures the distance from token i to the nearest
        # named entity

        lastindex = 0

        # whether the previous entity boundary was a sentence break
        prevempty = True

        with open(fname) as f:
            lines = f.readlines()

        for i, line in enumerate(lines):
            sline = line.split("\t")
            if len(sline) > 5:
                wfreq[sline[5]] += 1

            ent = isent(sline[0])
            empty = len(sline) < 5
            if ent or empty:
                dists[fname][i] = 0
                # when you see this, then go over all tokens since then...
                dd = i - lastindex

                startbias = 0
                endbias = 0

                if empty and prevempty:
                    # these are just large numbers...
                    startbias = 10000
                    endbias = 10000
                elif empty:
                    endbias = 10000
                elif prevempty:
                    startbias = 10000

                for j in range(lastindex + 1, i):
                    disttolast = j - lastindex
                    disttonext = dd - disttolast
                    dists[fname][j] = min(disttolast + startbias,
                                          disttonext + endbias)
                lastindex = i
                prevempty = empty

    # normalize the word frequencies
    mx = max(wfreq.values())
    for w in wfreq:
        wfreq[w] /= mx

    for method in methods:
        if len(method) > 0 and method not in allowedmethods:
            print(
                "Warning: {} not a supported method. Ignoring.".format(method))

    for fname in fnames:
        with open(fname) as f:
            lines = f.readlines()
        outlines = []
        for i, line in enumerate(lines):

            sline = line.strip().split("\t")
            if len(sline) > 5:

                sline[7] = str(dists[fname][i])

                if mention:
                    # in this case, we don't want weights.
                    sline[6] = 1.0
                    if sline[0] != "O":
                        sline[0] = "B-MNT"
                elif sline[0] == "O":

                    if defaultweight is not None:
                        sline[6] = defaultweight
                    try:
                        sline[6] = float(sline[6])
                    except Exception:
                        sline[6] = 0.0

                    if "random" in methods:
                        if random.random() < 0.25:
                            sline[6] += 1.0

                    # These all assign a weight to every O token.
                    if "softwindow" in methods:
                        sline[6] += softwindow(dists[fname][i])

                    if "freq" in methods:
                        sline[6] += freq(wfreq[sline[5]])

                    if "uniform" in methods:
                        sline[6] += uniform()

                    # The following give weights to just a few.
                    if "punc" in methods:
                        if sline[5] in punc or isnum(sline[5]):
                            sline[6] += 1.0

                    if "window" in methods:
                        if dists[fname][i] <= 1:
                            sline[6] += 1.0

                    if sline[6] == "x":
                        sline[6] = 0.0
                else:
                    sline[6] = 1.0

                sline[6] = str(sline[6])
                sline[8] = str(wfreq[sline[5]])
                outlines.append("\t".join(sline) + "\n")
            else:
                outlines.append("\n")

        if isfolder:
            fnonly = os.path.basename(fname)
            outpath = outfolder + "/" + fnonly
        else:
            outpath = outfolder

        with open(outpath, "w") as out:
            for line in outlines:
                out.write(line)
Example #20
def func(folder, outfolder, precision, recall):
    # This is v1 seed:
    random.seed(1234567)
    # v2 seed:
    #random.seed(4343)

    fnames = getfnames(folder)

    # this contains the sets of constituents and frequencies
    d = defaultdict(int)
    
    namesdocs = []
    total = 0
    labels = list()

    for fname in fnames:
        cdoc = readconll(fname)
        cons = cdoc.getconstituents()

        namesdocs.append((fname, cdoc))
        
        for c in cons:
            d[c.surf()] += 1
            total += 1
            labels.append(c.label)

    labels = list(labels)

    # build the set of names we will keep...
    goal = recall * total
    currnum = 0
    activecons = set()

    # impose ordering
    ditems = sorted(d.items())

    # make it random, but consistent.
    random.shuffle(ditems)

    for c, freq in ditems:
        activecons.add(c)
        currnum += freq
        if currnum >= goal:
            break

    print("Writing to {}".format(outfolder))
    for fname, cdoc in namesdocs:
        cons = cdoc.getconstituents()

        numpos = len(cons)
        for con in cons:
            # discard all the names we don't keep.
            if con.surf() not in activecons:
                # print(con)
                cdoc.removeconstituent(con)
                numpos -= 1

        badspanstoadd = math.ceil(numpos / precision - numpos)
        for _ in range(badspanstoadd):
            start = random.randrange(0, len(cdoc.tokens)-5)
            length = random.randrange(1, 3)
            end = start + length
            randlabel = random.choice(labels)
            newcon = Constituent(randlabel, cdoc.tokens[start:end], (start, end))
            cdoc.addconstituent(newcon)

        with open(outfolder + "/" + os.path.basename(fname), "w") as out:
            cdoc.write(out)
Example #21
def func(fof1, fof2, ignore=False):
    print("THIS ONLY RETURNS TOKEN LEVEL")
    fnames1 = sorted(getfnames(fof1))
    fnames2 = sorted(getfnames(fof2))

    labels = set()
    gold = []
    pred = []
    for f1, f2 in zip(fnames1, fnames2):
        print(f1, f2)
        
        try:
            with open(f1) as f:
                lines = f.readlines()

            with open(f2) as f:
                lines2 = f.readlines()
        except IOError as e:
            print(e)
            continue

        i = 0
        j = 0
        total = 0
        while i < len(lines) and j < len(lines2):
            sline = lines[i].split("\t")
            sline2 = lines2[j].split("\t")
            
            try:
                while "-DOCSTART-" in lines[i] or lines[i].strip() == "":
                    i += 1
                    sline = lines[i].split("\t")

                while "-DOCSTART-" in lines2[j] or lines2[j].strip() == "":
                    j += 1
                    sline2 = lines2[j].split("\t")
            except IndexError:
                break
            
            if len(sline) < 5:
                # advance both files; a bare continue here would never terminate
                i += 1
                j += 1
                continue

            try:
                predweight = sline2[6]
            except Exception:
                predweight = 1.0
                
            total += 1
            if ignore and float(predweight) == 0.0:
                pass
            else:
                gold.append(sline[0])
                pred.append(sline2[0])
                if sline[0] != "O":
                    labels.add(sline[0])
            
            #if sline[5] != sline2[5]:
            #    print("mismatching words!")
            #    print(sline[5])
            #    print(sline2[5])
            #    exit()

            i += 1
            j += 1

    labels = list(labels)
    p = precision_score(gold, pred, labels=labels, average="weighted")
    r = recall_score(gold, pred, labels=list(labels), average="weighted")
    f1 = f1_score(gold, pred, labels=list(labels), average="weighted")
    print("Scoring: {} lines out of {}".format(len(pred), total))
    print("SCORES: {} {} {}".format(p, r, f1))