示例#1
0
def _encode_predicate_sequences11(corpus, pos, mapping):
    """Encode each selected predicate of *corpus* as a list of mapping values.

    A word is selected when it is a ``Predicate`` and the first element of
    its ``pos`` equals *pos*.  The sequence built for it contains, in order:

    * the value for ``form + "_" + lemma + "." + sense`` of the predicate,
    * one value per argument whose represented form is not None, keyed as
      ``represented_form + "_" + w.arguments[arg]``,
    * the value for ``"EOS"``.

    Every key goes through ``mapping.add_value``, so the mapping is updated
    as a side effect (presumably add_value returns the key's index -- confirm
    against ``Mapping``).
    """
    sequences = []
    for sen in corpus:
        for w in sen:
            if not (isinstance(w, Predicate) and w.pos[0] == pos):
                continue
            seq = [mapping.add_value(w.form + "_" + w.lemma + "." + w.sense)]
            for arg in w.arguments:
                hn = get_represented_form(sen, arg)
                if hn is not None:
                    seq.append(
                        mapping.add_value(hn + "_" + w.arguments[arg]))
            seq.append(mapping.add_value("EOS"))
            sequences.append(seq)
    return sequences


def generate_sequential_data11(lst, lstv=None, pos="V"):
    """Build next-item prediction data from CoNLL-2009 files.

    Args:
        lst: iterable of training file names, each read with
            ``read_conll2009_corpus``.
        lstv: optional iterable of validation file names; when None no
            validation data is produced.
        pos: value compared against the first element of a predicate's
            ``pos`` to decide whether the predicate is used.

    Returns:
        ``(X, Y, Xv, Yv, mapping)``.  For every encoded sequence ``X`` holds
        all items but the last and ``Y`` all items but the first, so
        ``Y[k][i]`` is the item following ``X[k][i]``.  ``Xv``/``Yv`` are the
        same for the validation files, or None when *lstv* is None.
        ``mapping`` is the ``Mapping`` filled while encoding; validation
        items are registered through ``add_value`` as well.
    """
    # Named ``mapping`` rather than ``map`` so the builtin is not shadowed.
    mapping = Mapping()

    corpus = []
    for fn in lst:
        corpus.extend(read_conll2009_corpus(fn))
    all_data = _encode_predicate_sequences11(corpus, pos, mapping)
    X = [seq[:-1] for seq in all_data]
    Y = [seq[1:] for seq in all_data]

    Xv = None
    Yv = None
    if lstv is not None:
        corpusv = []
        for fn in lstv:
            corpusv.extend(read_conll2009_corpus(fn))
        all_datav = _encode_predicate_sequences11(corpusv, pos, mapping)
        Xv = [seq[:-1] for seq in all_datav]
        Yv = [seq[1:] for seq in all_datav]
    return X, Y, Xv, Yv, mapping
示例#2
0
def generate_sequential_data21_getmap(lst, lstv=None, pos="V", data="form"):
    """Populate and return the three mappings for the two-input encoding.

    All files named in *lst* are read with ``read_conll2009_corpus``.  For
    every ``Predicate`` whose first ``pos`` element equals *pos* the
    following keys are registered via ``add_value``:

    * ``mapX1``: the predicate form, each argument's form, and ``"EOS"``;
    * ``mapX2``: ``"PRED"``, each ``w.arguments[arg]`` value, and ``"EOS"``;
    * ``mapY1``: ``form + "_PRED"``, ``arg_form + "_" + w.arguments[arg]``,
      and ``"EOS_EOS"``.

    With ``data == "origin"`` the argument form is ``sen[arg].form``;
    otherwise it is ``get_represented_form(sen, arg)`` and arguments for
    which that returns None are skipped.

    *lstv* is accepted but not used by this function.

    Returns:
        ``(mapX1, mapX2, mapY1)``.
    """
    sentences = []
    mapX1, mapX2, mapY1 = Mapping(), Mapping(), Mapping()
    for path in lst:
        sentences.extend(read_conll2009_corpus(path))

    take_surface_form = data == "origin"
    for sen in sentences:
        for w in sen:
            if not (isinstance(w, Predicate) and w.pos[0] == pos):
                continue

            # Predicate entry.
            mapX1.add_value(w.form)
            mapX2.add_value("PRED")
            mapY1.add_value(w.form + "_PRED")

            # One entry per argument that has a usable form.
            for arg in w.arguments:
                if take_surface_form:
                    head = sen[arg].form
                else:
                    head = get_represented_form(sen, arg)
                if head is None:
                    continue
                mapX1.add_value(head)
                mapX2.add_value(w.arguments[arg])
                mapY1.add_value(head + "_" + w.arguments[arg])

            # End-of-sequence entry.
            mapX1.add_value("EOS")
            mapX2.add_value("EOS")
            mapY1.add_value("EOS_EOS")

    return mapX1, mapX2, mapY1
示例#3
0
def _encode_pred_arg_sequences11(corpus, pos, data, mapping):
    """Encode each selected predicate of *corpus* as a list of mapping values.

    A word is selected when it is a ``Predicate`` and the first element of
    its ``pos`` equals *pos*.  Its sequence is the value for
    ``form + "_PRED"``, then one value per kept argument keyed as
    ``arg_form + "_" + w.arguments[arg]``, then the value for ``"EOS"``.

    The argument form is ``sen[arg].form`` when ``data == "origin"`` and
    ``get_represented_form(sen, arg)`` otherwise; arguments whose form is
    None are dropped.  All keys are registered with ``mapping.add_value``.
    """
    sequences = []
    for sen in corpus:
        for w in sen:
            if not (isinstance(w, Predicate) and w.pos[0] == pos):
                continue
            seq = [mapping.add_value(w.form + "_PRED")]
            for arg in w.arguments:
                if data == "origin":
                    hn = sen[arg].form
                else:
                    hn = get_represented_form(sen, arg)
                if hn is not None:
                    seq.append(
                        mapping.add_value(hn + "_" + w.arguments[arg]))
            seq.append(mapping.add_value("EOS"))
            sequences.append(seq)
    return sequences


def generate_sequential_data11(lst,
                               lstv=None,
                               pos="V",
                               data="form",
                               count=100000):
    """Build next-item prediction data from CoNLL-2009 files.

    Args:
        lst: non-empty sequence of training file names.  ``lst[0]`` is always
            used in full; the remaining files are read first and may be
            truncated by *count*.  An empty sequence raises ``IndexError``.
        lstv: optional iterable of validation file names.
        pos: value compared against the first element of a predicate's
            ``pos`` to decide whether the predicate is used.
        data: ``"origin"`` takes an argument's own ``form``; any other value
            uses ``get_represented_form``.
        count: when not None, only the first ``count - 1`` sentences read
            from ``lst[1:]`` are kept (the original comparison is the strict
            ``c < count`` with ``c`` starting at 1; kept as-is so existing
            data sets do not change).  None disables the limit.

    Returns:
        ``(X, Y, Xv, Yv, mapping)``.  For every encoded sequence ``X`` holds
        all items but the last and ``Y`` all items but the first.
        ``Xv``/``Yv`` are built the same way from *lstv*, or are None when
        *lstv* is None.  ``mapping`` is the ``Mapping`` filled while
        encoding; validation items are registered via ``add_value`` too.
    """
    # Named ``mapping`` rather than ``map`` so the builtin is not shadowed.
    mapping = Mapping()

    # Auxiliary files come first so that the limit applies only to them.
    corpus = []
    for fn in lst[1:]:
        corpus.extend(read_conll2009_corpus(fn))
    if count is not None:
        corpus = [s for n, s in enumerate(corpus, start=1) if n < count]
    corpus.extend(read_conll2009_corpus(lst[0]))

    all_data = _encode_pred_arg_sequences11(corpus, pos, data, mapping)
    X = [seq[:-1] for seq in all_data]
    Y = [seq[1:] for seq in all_data]

    Xv = None
    Yv = None
    if lstv is not None:
        corpusv = []
        for fn in lstv:
            corpusv.extend(read_conll2009_corpus(fn))
        all_datav = _encode_pred_arg_sequences11(corpusv, pos, data, mapping)
        Xv = [seq[:-1] for seq in all_datav]
        Yv = [seq[1:] for seq in all_datav]
    return X, Y, Xv, Yv, mapping
示例#4
0
def _encode_triples21(sentences, pos, data, mapX1, mapX2, mapY1, grow):
    """Encode each selected predicate of *sentences* as a list of triples.

    A word is selected when it is a ``Predicate`` and the first element of
    its ``pos`` equals *pos*.  Every triple is
    ``(mapX1 value, mapX2 value, mapY1 value)``; a sequence consists of the
    predicate triple (keys ``form``, ``"PRED"``, ``form + "_PRED"``), one
    triple per kept argument (keys ``arg_form``, ``w.arguments[arg]``,
    ``arg_form + "_" + w.arguments[arg]``) and a closing triple (keys
    ``"EOS"``, ``"EOS"``, ``"EOS_EOS"``).

    When *grow* is true keys are resolved with ``add_value`` (the mappings
    are extended); otherwise with ``get_index`` (the mappings are only read).
    """

    def index(mapping, key):
        # Resolved per call so nothing is touched on the mappings when there
        # is nothing to encode.
        if grow:
            return mapping.add_value(key)
        return mapping.get_index(key)

    encoded = []
    for sen in sentences:
        for w in sen:
            if not (isinstance(w, Predicate) and w.pos[0] == pos):
                continue
            seq = [(index(mapX1, w.form), index(mapX2, "PRED"),
                    index(mapY1, w.form + "_PRED"))]
            for arg in w.arguments:
                if data == "origin":
                    hn = sen[arg].form
                else:
                    hn = get_represented_form(sen, arg)
                if hn is not None:
                    seq.append((index(mapX1, hn),
                                index(mapX2, w.arguments[arg]),
                                index(mapY1, hn + "_" + w.arguments[arg])))
            seq.append((index(mapX1, "EOS"), index(mapX2, "EOS"),
                        index(mapY1, "EOS_EOS")))
            encoded.append(seq)
    return encoded


def _split_inputs_targets21(encoded):
    """Split triple sequences into two input streams and a shifted target.

    The inputs take components 0 and 1 of every triple but the last; the
    target takes component 2 of every triple but the first.
    """
    first = [[t[0] for t in seq[:-1]] for seq in encoded]
    second = [[t[1] for t in seq[:-1]] for seq in encoded]
    target = [[t[2] for t in seq[1:]] for seq in encoded]
    return first, second, target


def generate_sequential_data21(corpus,
                               lstv=None,
                               pos="V",
                               data="form",
                               mapX1=None,
                               mapX2=None,
                               mapY1=None):
    """Build two-input / one-output sequence data from parsed sentences.

    Args:
        corpus: iterable of already-parsed sentences (training data).
        lstv: optional iterable of validation file names, each read with
            ``read_conll2009_corpus``.
        pos: value compared against the first element of a predicate's
            ``pos`` to decide whether the predicate is used.
        data: ``"origin"`` takes an argument's own ``form``; any other value
            uses ``get_represented_form`` and drops arguments for which it
            returns None.
        mapX1, mapX2, mapY1: existing mappings.  When *mapX1* is not None the
            three mappings are only queried with ``get_index``.  When
            *mapX1* is None three fresh ``Mapping`` objects are created
            (replacing whatever was passed for *mapX2*/*mapY1*) and filled
            with ``add_value``.

    Returns:
        ``(X1, X2, Y, Xv1, Xv2, Yv, mapX1, mapX2, mapY1)``; the validation
        entries are None when *lstv* is None.
    """
    grow = mapX1 is None
    if grow:
        mapX1 = Mapping()
        mapX2 = Mapping()
        mapY1 = Mapping()
        # The original code ran ``all_data.extend()`` once per sentence at
        # this point; ``list.extend`` needs an argument, so that raised
        # TypeError for any non-empty corpus.  The loop did nothing useful
        # and has been removed.

    all_data = _encode_triples21(corpus, pos, data, mapX1, mapX2, mapY1, grow)
    X1, X2, Y = _split_inputs_targets21(all_data)

    Xv1 = None
    Xv2 = None
    Yv = None
    if lstv is not None:
        corpusv = []
        for fn in lstv:
            corpusv.extend(read_conll2009_corpus(fn))
        all_datav = _encode_triples21(corpusv, pos, data, mapX1, mapX2,
                                      mapY1, grow)
        Xv1, Xv2, Yv = _split_inputs_targets21(all_datav)
    return X1, X2, Y, Xv1, Xv2, Yv, mapX1, mapX2, mapY1
示例#5
0
 def readAll(self):
     """Read every file in ``self.input_file`` and return the combined data.

     Each file is read with ``read_conll2009_corpus`` and its items are
     appended, in file order, to a single list.  Returns an empty list when
     ``self.input_file`` is empty.
     """
     txt = []
     for f in self.input_file:
         # Accumulate instead of rebinding: plain assignment discarded the
         # data of every file except the last one.
         txt.extend(read_conll2009_corpus(f))
     return txt
示例#6
0
                words.append(l.strip())
        if len(words) != 0:
            if end is None:
                if idx >= start:
                    sens.append(words)

            else:

                if idx >= start:
                    if idx < end:
                        sens.append(words)

        for sen in sens:
            conll2009sen = read_conll2009_sentence(sen, read_label, use_gold=use_gold)
            txt.append(conll2009sen)
        return txt

if __name__ == "__main__":
    # Manual check: the number of items delivered batch by batch by
    # Conll2009BatchReader is printed first, then the number of items
    # returned by one whole-file read of the same path.
    train_file = "/home/quynh/working/Data/conll2009/train.conll2009.pp.txt"

    lst = [train_file]
    reader = Conll2009BatchReader(1000, lst)
    count = 0
    txt = reader.next()
    # An empty batch signals that the reader has nothing more to deliver.
    while len(txt) != 0:
        count += len(txt)
        txt = reader.next()

    print(count)

    txt = read_conll2009_corpus(train_file)
    print(len(txt))