Пример #1
0
def run(dataset):
    """Run the bigram/inf_count experiment on *dataset* and log the results.

    Five passes are made. Each pass builds a fresh corpus with
    ``shuffle=True`` and, for every ``[bigram, inf_count]`` setting in
    ``array``, extracts features, splits them into train/test sets and
    calls ``cls.train``. The strings returned by ``cls.train`` are appended
    to one CSV file per setting and per backend, created in the current
    working directory.

    Relies on the module-level flags ``NLTK`` and ``SKLEARN`` and on
    ``datetime``, ``crp``, ``ftr`` and ``cls`` being available at module
    scope.

    Args:
        dataset: Dataset name; passed to ``crp.Corpora`` and embedded in the
            output file names.
    """
    COUNT = 40000
    # Number of items taken from each class for training: 3/4 of COUNT / 2.
    cut = int((COUNT / 2) * 3 / 4)
    # Each entry is [bigram flag, inf_count].
    array = [[True, 7000], [True, 9000], [True, 11000], [True, 14000]]

    nlt = dict()
    skl = dict()

    try:
        # file: open one append-mode log per setting and write a run header
        for variable in array:
            var_name = str(variable[0]) + str(variable[1])
            if NLTK:
                nlt_file = "bigram15-comb-" + dataset + "-" + var_name + "-nlt.csv"
                nlt[var_name] = open(nlt_file, 'a')
                nlt[var_name].write(
                    str(datetime.datetime.today()) + " COUNT= " + str(COUNT) +
                    "\n")

            if SKLEARN:
                skl_file = "bigram15-comb-" + dataset + "-" + var_name + "-skl.csv"
                skl[var_name] = open(skl_file, 'a')
                skl[var_name].write(
                    str(datetime.datetime.today()) + " COUNT= " + str(COUNT) +
                    "\n")

        # cycle
        for x in range(0, 5):
            print(x)
            corpora = crp.Corpora(dataset, count=COUNT, shuffle=True)

            for variable in array:
                print(str(variable[1]))
                var_name = str(variable[0]) + str(variable[1])
                features = ftr.Features(corpora,
                                        total=COUNT,
                                        bigram=variable[0],
                                        bigram_count=15,
                                        inf_count=variable[1])

                posfeats = features.get_features_pos()
                # "fearures" (sic) is the method name as spelled by ftr.Features.
                negfeats = features.get_fearures_neg()

                # The first `cut` items of each class train, the rest test.
                trainfeats = negfeats[:cut] + posfeats[:cut]
                testfeats = negfeats[cut:] + posfeats[cut:]

                nlt_output, skl_output = cls.train(trainfeats,
                                                   testfeats,
                                                   nlt=NLTK,
                                                   skl=SKLEARN)

                # Flush after every write so partial results survive a crash.
                if NLTK:
                    print(str(nlt_output))
                    nlt[var_name].write(nlt_output)
                    nlt[var_name].flush()
                if SKLEARN:
                    skl[var_name].write(skl_output)
                    skl[var_name].flush()
    finally:
        # Release every log handle, even when an experiment raises.
        for handle in list(nlt.values()) + list(skl.values()):
            handle.close()
Пример #2
0
def run(dataset):
    """Run the corpus-size experiment on *dataset* and log the results.

    Every entry of the module-level ``array`` is used as a corpus size.
    Ten passes are made; in each pass and for each size a shuffled corpus
    is built, features are extracted with ``inf_count=-1``, split into
    train/test sets and passed to ``cls.train``. Returned output strings
    are printed and appended to per-size CSV files under
    ``output/<dataset>/<type>/``.

    Relies on module-level ``array``, ``type`` (a string that shadows the
    builtin), ``nltk_run``, ``sklearn_run`` and on ``os``, ``datetime``,
    ``crp``, ``ftr`` and ``cls`` being available at module scope.

    Args:
        dataset: Dataset name; passed to ``crp.Corpora`` and used in the
            output paths.
    """
    nlt = dict()
    skl = dict()

    out_dir = "output/" + dataset + "/" + type + "/"
    os.makedirs(out_dir, exist_ok=True)

    try:
        # file: open one append-mode log per size and write a run header
        for variable in array:
            var_name = str(variable)

            if nltk_run:
                nlt_file = out_dir + dataset + "-" + type + "-" + var_name + "-nlt.csv"
                nlt[var_name] = open(nlt_file, 'a')
                nlt[var_name].write(str(datetime.datetime.today()) + "\n")

            if sklearn_run:
                skl_file = out_dir + dataset + "-" + type + "-" + var_name + "-skl.csv"
                skl[var_name] = open(skl_file, 'a')
                skl[var_name].write(str(datetime.datetime.today()) + "\n")

        # cycle
        for x in range(0, 10):
            print(x)

            for variable in array:
                print(str(variable))
                var_name = str(variable)

                COUNT = variable
                # Number of items taken from each class for training:
                # 4/5 of COUNT / 2.
                cut = int((COUNT / 2) * 4 / 5)

                corpora = crp.Corpora(dataset, count=COUNT, shuffle=True)
                features = ftr.Features(corpora, total=COUNT, inf_count=-1)

                posfeats = features.get_features_pos()
                # "fearures" (sic) is the method name as spelled by ftr.Features.
                negfeats = features.get_fearures_neg()

                trainfeats = negfeats[:cut] + posfeats[:cut]
                testfeats = negfeats[cut:] + posfeats[cut:]

                # Best effort: a failing configuration is reported and skipped
                # so the remaining sizes still run.
                try:
                    nlt_output, skl_output = cls.train(trainfeats, testfeats, nlt=nltk_run, skl=sklearn_run)
                except Exception as e:
                    print(e)
                    continue

                # Flush after every write so partial results survive a crash.
                if nltk_run:
                    print(str(nlt_output))
                    nlt[var_name].write(nlt_output)
                    nlt[var_name].flush()
                if sklearn_run:
                    print(str(skl_output))
                    skl[var_name].write(skl_output)
                    skl[var_name].flush()
    finally:
        # Release every log handle, even when an experiment raises.
        for handle in list(nlt.values()) + list(skl.values()):
            handle.close()
Пример #3
0
def run(dataset):
    """Run the inf_count experiment on *dataset* and log the results.

    Ten passes are made. Each pass builds a shuffled corpus of the
    module-level ``COUNT`` items and, for every entry of the module-level
    ``array`` (used as ``inf_count``), extracts features with bigrams,
    stop-word handling, Porter stemming and lower-casing enabled, splits
    them with the module-level ``cut`` and calls ``cls.train``. The
    returned output strings are appended to per-setting CSV files under
    ``output/<dataset>/<type>/``.

    Relies on module-level ``COUNT``, ``cut``, ``array``, ``type`` (a string
    that shadows the builtin), ``nltk_run``, ``sklearn_run`` and on ``os``,
    ``datetime``, ``crp``, ``ftr`` and ``cls`` being available at module
    scope.

    Args:
        dataset: Dataset name; passed to ``crp.Corpora`` and ``cls.train``
            and used in the output paths.
    """
    nlt = dict()
    skl = dict()

    out_dir = "output/" + dataset + "/" + type + "/"
    os.makedirs(out_dir, exist_ok=True)

    try:
        # file: open one append-mode log per setting and write a run header
        for variable in array:
            var_name = str(variable)

            if nltk_run:
                nlt_file = out_dir + dataset + "-" + type + "-" + var_name + "-nlt.csv"
                nlt[var_name] = open(nlt_file, 'a')
                nlt[var_name].write(str(datetime.datetime.today()) + "\n")

            if sklearn_run:
                skl_file = out_dir + dataset + "-" + type + "-" + var_name + "-skl.csv"
                skl[var_name] = open(skl_file, 'a')
                skl[var_name].write(str(datetime.datetime.today()) + "\n")

        # cycle
        for x in range(0, 10):
            print(x)
            corpora = crp.Corpora(dataset, count=COUNT, shuffle=True)

            for variable in array:
                print(str(variable))
                var_name = str(variable)
                features = ftr.Features(corpora,
                                        total=COUNT,
                                        bigram=True,
                                        stop=True,
                                        stem="porter",
                                        lower=True,
                                        inf_count=variable)

                posfeats = features.get_features_pos()
                # "fearures" (sic) is the method name as spelled by ftr.Features.
                negfeats = features.get_fearures_neg()

                # The first `cut` items of each class train, the rest test.
                trainfeats = negfeats[:cut] + posfeats[:cut]
                testfeats = negfeats[cut:] + posfeats[cut:]

                nlt_output, skl_output = cls.train(trainfeats,
                                                   testfeats,
                                                   dataset,
                                                   nlt=nltk_run,
                                                   skl=sklearn_run)

                # Persist the results: the logs opened above would otherwise
                # hold only their header line.
                if nltk_run:
                    nlt[var_name].write(nlt_output)
                    nlt[var_name].flush()
                if sklearn_run:
                    skl[var_name].write(skl_output)
                    skl[var_name].flush()
    finally:
        # Release every log handle, even when an experiment raises.
        for handle in list(nlt.values()) + list(skl.values()):
            handle.close()
Пример #4
0
def run(dataset):
    """Run the part-of-speech filter experiment on *dataset*.

    Ten passes are made. Each pass builds a shuffled corpus of ``COUNT``
    items and trains once per ``pos`` filter variant (no filter, then
    progressively larger tag lists). ``cls.train`` is called with
    ``skl=False``; only the first element of its result is printed and
    appended to ``output/<dataset>/pos/<dataset>-pos-<variant>-nlt.csv``.

    Relies on ``os``, ``crp``, ``ftr`` and ``cls`` being available at
    module scope.

    Args:
        dataset: Dataset name; passed to ``crp.Corpora`` and used in the
            output paths.
    """
    COUNT = 5000
    # Number of items taken from each class for training: 3/4 of COUNT / 2.
    cut = int((COUNT / 2) * 3 / 4)

    out_dir = "output/" + dataset + "/pos/"
    os.makedirs(out_dir, exist_ok=True)

    # (file-name label, tags passed as `pos`); None means no `pos` filter.
    variants = (
        ("NONE", None),
        ("JVNR", ("J", "V", "N", "R")),
        ("EUJVNR", ("E", "U", "J", "V", "N", "R")),
        ("FEUPJVNR", ("F", "E", "U", "P", "J", "V", "N", "R")),
    )

    # cycle
    for x in range(0, 10):
        print(x)
        corpora = crp.Corpora(dataset, count=COUNT, shuffle=True)

        for var, tags in variants:
            nlt_file = out_dir + dataset + "-" + "pos" + "-" + var + "-nlt.csv"

            # Open in append mode per run and close right after, instead of
            # leaking a new handle on every pass.
            with open(nlt_file, 'a') as nlt_out:
                # Hand Features a fresh list each time, as the original
                # per-variant list literals did.
                pos = None if tags is None else list(tags)
                features = ftr.Features(corpora, total=COUNT, pos=pos)

                posfeats = features.get_features_pos()
                # "fearures" (sic) is the method name as spelled by ftr.Features.
                negfeats = features.get_fearures_neg()

                # The first `cut` items of each class train, the rest test.
                trainfeats = negfeats[:cut] + posfeats[:cut]
                testfeats = negfeats[cut:] + posfeats[cut:]

                nlt_output, _skl_output = cls.train(trainfeats,
                                                    testfeats,
                                                    skl=False)

                nlt_out.write(nlt_output)
                nlt_out.flush()
                print(str(nlt_output))
Пример #5
0
def run_test(dataset,
             type,
             iter=10,
             count=5000,
             shuffle=False,
             nltk_run=True,
             sklearn_run=True,
             inf_count=-1,
             bigram_count=50,
             pos=None,
             stop=False,
             stem="none",
             bigram=False,
             lower=True):
    """Run a parameterised experiment on *dataset* and log the results.

    For ``iter`` passes a corpus is built and, for every entry of the
    module-level ``array``, features are extracted, split into train/test
    sets and passed to ``cls.train``. The returned output strings are
    printed and appended to
    ``output/<dataset>/<dataset>-<type>-<entry>-nlt.csv`` / ``-skl.csv``.

    Relies on module-level ``array`` and on ``datetime``, ``crp``, ``ftr``
    and ``cls`` being available at module scope.

    Args:
        dataset: Dataset name; passed to ``crp.Corpora`` and used in the
            output paths.
        type: Experiment label embedded in the output file names.
        iter: Number of passes.
        count: Corpus size; also determines the train/test cut.
        shuffle: Forwarded to ``crp.Corpora``.
        nltk_run: Forwarded to ``cls.train`` as ``nlt``; enables the
            ``-nlt.csv`` log.
        sklearn_run: Forwarded to ``cls.train`` as ``skl``; enables the
            ``-skl.csv`` log.
        inf_count, bigram_count, pos, stop, stem, bigram, lower: Forwarded
            positionally, in this order, to ``ftr.Features`` after
            ``corpora`` and ``count``.
    """
    # Local import: the module's import block is not visible from here.
    import os

    # Number of items taken from each class for training: 3/4 of count / 2.
    cut = int((count / 2) * 3 / 4)

    nlt = dict()
    skl = dict()

    out_dir = "output/" + dataset + "/"
    # Make sure the target directory exists before opening files in it.
    os.makedirs(out_dir, exist_ok=True)

    try:
        # file: open one append-mode log per entry and write a run header
        for variable in array:
            var_name = str(variable)

            if nltk_run:
                nlt_file = out_dir + dataset + "-" + type + "-" + var_name + "-nlt.csv"
                nlt[var_name] = open(nlt_file, 'a')
                nlt[var_name].write(str(datetime.datetime.today()) + "\n")

            if sklearn_run:
                skl_file = out_dir + dataset + "-" + type + "-" + var_name + "-skl.csv"
                skl[var_name] = open(skl_file, 'a')
                skl[var_name].write(str(datetime.datetime.today()) + "\n")

        # cycle
        for x in range(0, iter):
            print(x)
            corpora = crp.Corpora(dataset, count, shuffle)

            for variable in array:
                # Must be built exactly like the key used when the files were
                # opened above, otherwise the lookups below raise KeyError.
                var_name = str(variable)
                print(var_name)
                # NOTE(review): `variable` only selects the output file; the
                # feature configuration is identical for every entry. Confirm
                # whether one of the parameters is meant to vary with it.
                features = ftr.Features(corpora, count, inf_count, bigram_count,
                                        pos, stop, stem, bigram, lower)

                posfeats = features.get_features_pos()
                # "fearures" (sic) is the method name as spelled by ftr.Features.
                negfeats = features.get_fearures_neg()

                # The first `cut` items of each class train, the rest test.
                trainfeats = negfeats[:cut] + posfeats[:cut]
                testfeats = negfeats[cut:] + posfeats[cut:]

                nlt_output, skl_output = cls.train(trainfeats,
                                                   testfeats,
                                                   nlt=nltk_run,
                                                   skl=sklearn_run)

                # Flush after every write so partial results survive a crash.
                if nltk_run:
                    print(str(nlt_output))
                    nlt[var_name].write(nlt_output)
                    nlt[var_name].flush()
                if sklearn_run:
                    print(str(skl_output))
                    skl[var_name].write(skl_output)
                    skl[var_name].flush()
    finally:
        # Release every log handle, even when an experiment raises.
        for handle in list(nlt.values()) + list(skl.values()):
            handle.close()


# dataset_array = ["stwits"]
#
# for dataset in dataset_array:
#     run(dataset)
Пример #6
0
    sklearn_output = "skl, " + str(mnb) + ", " + str(bnb) + ", " + str(
        lr) + ", " + str(lsvc) + ", " + str(nsvc) + ", " + str(voted) + "\n"

    return (nltk_output, sklearn_output)


if __name__ == '__main__':
    # Manual smoke run: train once on a shuffled "stwits" corpus and print
    # both outputs returned by train().
    COUNT = 5000
    # Number of items taken from each class for training: 4/5 of COUNT / 2.
    cut = int((COUNT / 2) * 4 / 5)

    corpora = crp.Corpora("stwits", count=COUNT, shuffle=True)

    # Feature-extraction settings, forwarded to ftr.Features as keywords.
    feature_options = dict(
        total=COUNT,
        stem="porter",
        bigram=True,
        stop=True,
        inf_count=-1,
        lower=True,
    )
    features = ftr.Features(corpora, **feature_options)

    posfeats = features.get_features_pos()
    # "fearures" (sic) is the method name as spelled by ftr.Features.
    negfeats = features.get_fearures_neg()

    # The first `cut` items of each class train, the rest test.
    trainfeats = negfeats[:cut] + posfeats[:cut]
    testfeats = negfeats[cut:] + posfeats[cut:]

    split_sizes = (len(trainfeats), len(testfeats))
    print('train on %d instances, test on %d instances' % split_sizes)

    nlt, skl = train(trainfeats, testfeats, skl=False, most=50)
    print(nlt, skl)
Пример #7
0
        neg_prec) + ", " + str(pos_rec) + ", " + str(neg_rec) + "\n"
    sklearn_output = "skl, " + str(mnb) + ", " + str(bnb) + ", " + str(
        lr) + ", " + str(lsvc) + ", " + str(nsvc) + ", " + str(voted) + "\n"

    return (nltk_output, sklearn_output)


if __name__ == '__main__':
    # Manual run on the "stanford" corpus: build features, pickle them for
    # later reuse, then report the train/test split sizes.
    COUNT = 25000
    # Number of items taken from each class for training: 4/5 of COUNT / 2.
    cut = int((COUNT / 2) * 4 / 5)

    corpora = crp.Corpora("stanford", count=COUNT, shuffle=True)
    features = ftr.Features(corpora,
                            total=COUNT,
                            bigram=True,
                            stem="porter",
                            stop=False,
                            lower=True,
                            inf_count=5000)

    # The context manager closes the file even if pickling fails.
    # The "pickled/" directory is not created here and must already exist.
    with open("pickled/features.pickle", "wb") as features_f:
        pickle.dump(features, features_f)

    posfeats = features.get_features_pos()
    # "fearures" (sic) is the method name as spelled by ftr.Features.
    negfeats = features.get_fearures_neg()

    # The first `cut` items of each class train, the rest test.
    trainfeats = negfeats[:cut] + posfeats[:cut]
    testfeats = negfeats[cut:] + posfeats[cut:]

    print('train on %d instances, test on %d instances' %
          (len(trainfeats), len(testfeats)))