Exemplo n.º 1
0
import util

" PARAMETERS "
# Default for the flag forwarded to util.line_toseq; may be overridden from argv below.
charstop = False  # True means label attributes to previous char
" END OF PARAMETERS "

"python cpr.py 'qualitative/allover-sjw-gold.txt' 'qualitative/allover-sjw-me.txt' 1"
# Command-line override: argv[1] and argv[2] are glob patterns for the two
# inputs, argv[3] is charstop given as an integer.
# NOTE(review): only len(args) > 1 is checked, so passing fewer than three
# arguments raises IndexError, and with no arguments material1/material2 are
# never assigned in the visible code.
# NOTE(review): sys and glob are used here but only `import util` is visible
# above -- confirm they are imported elsewhere.
args = sys.argv
if len(args) > 1:
    material1 = args[1]
    material2 = args[2]
    charstop = int(args[3])

# Read every line that util.file_to_lines yields for the files matching each pattern.
print "Reading from files..."
gold = [line for line in util.file_to_lines(glob.glob(material1))]
out = [line for line in util.file_to_lines(glob.glob(material2))]

# Convert each gold line with util.line_toseq (same charstop for both inputs).
golddata = []
for line in gold:
    golddata.append(util.line_toseq(line, charstop))

# Convert each output line the same way.
outdata = []
for line in out:
    outdata.append(util.line_toseq(line, charstop))

# testdata shape: [([x1, x2, ...],[y1,y2,...]),([],[])]
# (presumably golddata/outdata share this shape -- confirm against util.line_toseq)

results = []
# Both inputs must yield the same number of lines.
# NOTE: assert statements are skipped when Python runs with -O.
assert len(golddata) == len(outdata)
for i in range(len(golddata)):
Exemplo n.º 2
0
# -*- coding: utf8 -*-
import util
import sys
import glob

# Glob pattern of the corpus files to dump (alternative location: 'data/24s/*').
material = '../data/24s/*'
# Line counter placeholder; it is never incremented.
i = 0
paths = glob.glob(material)
for line in util.file_to_lines(paths):
    # Join the items returned by util.line_toraw with single spaces and print them.
    raw = util.line_toraw(line)
    print(" ".join(raw))
Exemplo n.º 3
0
# Model name: material with '/' and '*' removed, followed by size and the
# literal suffix "glove50".
modelname = material.replace('/','').replace('*','')+str(size)+"glove50"
# Settings presumably consumed further down the script (not visible here).
validate_interval = 10000
hidden_size = 50
learning_rate = 0.001
# Fixed seed so the shuffle below gives the same order on every run.
random.seed(101)

# Echo the run configuration (these names are defined outside this excerpt).
print "Material:", material
print "Size:", size, "entries,", trainportion, "as training", validateportion, "as validation"
print "Dense:", dense
print "charstop:", charstop

starttime = datetime.datetime.now()
print "Starting Time:",starttime

# Read all lines from the files matching material, shuffle, keep the first `size`.
print "Preparing text..."
li = [line for line in util.file_to_lines(glob.glob(material))]
random.shuffle(li)
li = li[:size]

# When dense is truthy, vdict comes from util.lstmvec(dictfile); otherwise a
# charset is built from the sampled lines (meaning of the argument 7 is not
# visible here). Only one of vdict / charset is defined after this point.
print "Preparing dictionaries..."
if dense: vdict = util.lstmvec(dictfile)
else: charset = util.make_charset(li,7)

print "Preparing datasets..."

# Three-way split at indices cut1 and cut2 (computed outside this excerpt).
dataset_train = li[:cut1]
dataset_validate = li[cut1:cut2]
dataset_test = li[cut2:]

dataset = []
while dataset_train:
# Model name: material with '/' and '*' removed, followed by size and the
# literal suffix "glove50".
modelname = material.replace('/', '').replace('*', '') + str(size) + "glove50"
# Settings presumably consumed further down the script (not visible here).
validate_interval = 10000
hidden_size = 50
learning_rate = 0.001
# Fixed seed so the shuffle below gives the same order on every run.
random.seed(101)

# Echo the run configuration (these names are defined outside this excerpt).
print "Material:", material
print "Size:", size, "entries,", trainportion, "as training", validateportion, "as validation"
print "Dense:", dense
print "charstop:", charstop

starttime = datetime.datetime.now()
print "Starting Time:", starttime

# Read all lines from the files matching material, shuffle, keep the first `size`.
print "Preparing text..."
li = [line for line in util.file_to_lines(glob.glob(material))]
random.shuffle(li)
li = li[:size]

# When dense is truthy, vdict comes from util.lstmvec(dictfile); otherwise a
# charset is built from the sampled lines (meaning of the argument 7 is not
# visible here). Only one of vdict / charset is defined after this point.
print "Preparing dictionaries..."
if dense: vdict = util.lstmvec(dictfile)
else: charset = util.make_charset(li, 7)

print "Preparing datasets..."

# Three-way split at indices cut1 and cut2 (computed outside this excerpt).
dataset_train = li[:cut1]
dataset_validate = li[cut1:cut2]
dataset_test = li[cut2:]

dataset = []
while dataset_train:
Exemplo n.º 5
0
def buildCrf(inputtext):
    """Train a CRF model on a random sample of lines and report closed-test scores.

    Lines are read from the files matching ``inputtext``, shuffled with a
    fixed seed and truncated to ``size`` entries. The first
    ``size * trainportion`` entries are used for training; the trained model
    is then evaluated on those same entries (closed test) for the "S" label
    and precision, recall and F1 are printed.

    Args:
        inputtext: Glob pattern selecting the input files.

    Returns:
        The file name of the trained model. A text dump of the model is
        written next to it as ``<modelname>.txt``.

    Raises:
        ValueError: If ``features`` is not 1, 2 or 3, or if the glob pattern
            yields no training sequences.
    """
    material = inputtext
    filename = 'model'
    size = 80
    trainportion = 0.9
    dictfile = 'data/vector/24scbow300.txt'
    crfmethod = "l2sgd"  # one of: lbfgs, l2sgd, ap, pa, arow
    charstop = True  # True means label attributes to previous char
    features = 1  # 1=discrete; 2=vectors; 3=both
    random.seed(101)

    # Command-line overrides (python runcrf.py 'data/sjw/*' 80 <dictfile> 1 1)
    # are disabled; the parameters above are fixed.

    # Fail early instead of hitting an unbound feature tuple inside the loop.
    if features not in (1, 2, 3):
        raise ValueError("features must be 1, 2 or 3, got %r" % (features,))

    cut = int(size * trainportion)

    # Name of the trained model file.
    modelname = filename.replace('/', '').replace(
        '*', '') + str(size) + str(charstop) + ".m"
    print(modelname)
    print("Material:", material)
    print("Size:", size, "entries,", trainportion, "as training")

    print(datetime.datetime.now())

    # Prepare li: shuffled sample of at most `size` lines.
    if features > 1:
        vdict = util.readvec(dictfile)
        print("Dict:", dictfile)
    li = list(util.file_to_lines(glob.glob(material)))
    random.shuffle(li)
    li = li[:size]

    # Prepare data: list of (feature sequence, label sequence) pairs.
    data = []
    for line in li:
        x, y = util.line_toseq(line, charstop)
        if features == 1:
            feats = crf.x_seq_to_features_discrete(x, charstop)
        elif features == 2:
            feats = crf.x_seq_to_features_vector(x, vdict, charstop)
        else:
            feats = crf.x_seq_to_features_both(x, vdict, charstop)
        data.append((feats, y))

    # Only the training portion is used below; the remainder is not evaluated.
    traindata = data[:cut]
    if not traindata:
        raise ValueError("no training sequences found for %r" % (material,))

    trainer = pycrfsuite.Trainer()
    for x, y in traindata:
        trainer.append(x, y)

    trainer.select(crfmethod)
    trainer.set('max_iterations', 10)
    trainer.train(modelname)

    tagger = pycrfsuite.Tagger()
    tagger.open(modelname)
    try:
        # Write a human-readable dump of the trained model.
        tagger.dump(modelname + ".txt")

        print(datetime.datetime.now())
        print("Start closed testing...")
        results = []
        # Walk the training set back to front so that the diagnostics printed
        # at the end refer to the first training sequence.
        for x, yref in reversed(traindata):
            yout = tagger.tag(x)
            results.append(util.eval(yref, yout, "S"))

        # The tagger still holds the last sequence given to tag(), so these
        # diagnostics only need to be computed once, after the loop.
        pr = tagger.marginal('S', 0)
        pp = tagger.probability(yout)
    finally:
        tagger.close()

    # Each result is unpacked as (tp, fp, fn, tn); sum every column.
    tp, fp, fn, tn = (sum(column) for column in zip(*results))

    # Guard the ratios: a model that never predicts "S", or a reference with
    # no "S", would otherwise raise ZeroDivisionError.
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0

    print("Total tokens in Train Set:", tp + fp + fn + tn)
    print("Total S in REF:", tp + fn)
    print("Total S in OUT:", tp + fp)
    print("Presicion:", p)
    print("Recall:", r)
    print("*******************F1-score:", f1)
    print("*******************:", pr)
    print("*******************:", pp)
    print("*******************:", yout)
    print(datetime.datetime.now())

    return (modelname)
Exemplo n.º 6
0
# Presumably the train/test split index, derived from size and trainportion.
# NOTE(review): the truncation of li to `size` is commented out below, yet
# cut is still computed from size -- confirm this is intended.
cut = int(size * trainportion)

# Name of the trained model: material with '/' and '*' removed, followed by
# size, charstop and the ".m" suffix.
modelname = material.replace('/', '').replace(
    '*', '') + str(size) + str(charstop) + ".m"

print("Material:", material)
print("Size:", size, "entries,", trainportion, "as training")

print(datetime.datetime.now())

# Prepare li: list of shuffled lines
if features > 1:
    vdict = util.readvec(dictfile)  # read the vectors first (only when features > 1)
    print("Dict:", dictfile)
li = [line for line in util.file_to_lines(glob.glob(material))]  # already split into a list
random.shuffle(li)  # randomize the order
print(len(li))
# Truncation to `size` entries is disabled here, so every line is kept.
#li = li[:size]

# Prepare data: list of x(char), y(label) sequences
data = []

for line in li:
    x, y = util.line_toseq(line, charstop)
    #print(x)
    #print(y[:5])

    # Build the features for the text here (the original note mentions "gram").
    if features == 1:
        d = crf.x_seq_to_features_discrete(x, charstop, 1), y
Exemplo n.º 7
0
"python runhmm.py 'data/sjw/*' 80 1"
# Command-line override: argv[1] is the glob pattern, argv[2] the sample size
# and argv[3] charstop (both converted to int).
# NOTE(review): only len(args)>1 is checked, so passing one or two arguments
# raises IndexError.
args = sys.argv
if len(args)>1:
    material = args[1]
    size = int(args[2])
    charstop = int(args[3])
# Split index between the training part and the rest of the sample
# (trainportion is defined outside this excerpt).
cut = int(size*trainportion)

print "Material:", material
print "Size:", size, "entries,", trainportion, "as training"

print "Starting Time:",datetime.datetime.now()

# Prepare li: shuffled list of lines, truncated to `size` entries
print "Reading from files..."
li = [line for line in util.file_to_lines(glob.glob(material))]
random.shuffle(li)
li = li[:size]

# Prepare data: list of x(char), y(label) sequences
print "Prepare list of sequences..."

# First `cut` lines are used for training (and, per the name, presumably for
# closed testing); the remaining lines are kept as testdata.
closetestdata = li[:cut]
testdata = li[cut:]

# Pair every x with its y for each training line. The print statements above
# imply Python 2, where zip() returns a list.
traindata = []
for line in closetestdata:
    x, y = util.line_toseq(line, charstop)
    traindata.append(zip(x,y))

# traindata shape: [[(x,y),(x,y), ...],[],[],...]
Exemplo n.º 8
0
import util

" PARAMETERS "
# Default for the flag forwarded to util.line_toseq; may be overridden from argv below.
charstop = False # True means label attributes to previous char
" END OF PARAMETERS "

"python cpr.py 'qualitative/allover-sjw-gold.txt' 'qualitative/allover-sjw-me.txt' 1"
# Command-line override: argv[1] and argv[2] are glob patterns for the two
# inputs, argv[3] is charstop given as an integer.
# NOTE(review): only len(args)>1 is checked, so passing fewer than three
# arguments raises IndexError, and with no arguments material1/material2 are
# never assigned in the visible code.
# NOTE(review): sys and glob are used here but only `import util` is visible
# above -- confirm they are imported elsewhere.
args = sys.argv
if len(args)>1:
    material1 = args[1]
    material2 = args[2]
    charstop = int(args[3])

# Read every line that util.file_to_lines yields for the files matching each pattern.
print "Reading from files..."
gold = [line for line in util.file_to_lines(glob.glob(material1))]
out = [line for line in util.file_to_lines(glob.glob(material2))]


# Convert each gold line with util.line_toseq (same charstop for both inputs).
golddata = []
for line in gold:
    golddata.append(util.line_toseq(line, charstop))

# Convert each output line the same way.
outdata = []
for line in out:
    outdata.append(util.line_toseq(line, charstop))

# testdata shape: [([x1, x2, ...],[y1,y2,...]),([],[])]
# (presumably golddata/outdata share this shape -- confirm against util.line_toseq)

results = []
# Both inputs must yield the same number of lines.
# NOTE: assert statements are skipped when Python runs with -O.
assert len(golddata)==len(outdata)
Exemplo n.º 9
0
# Command-line override: argv[1] is the glob pattern, argv[2] the sample size
# and argv[3] charstop (both converted to int); argv[4] is stored unconverted
# in hu (its meaning is not visible in this excerpt).
# NOTE(review): only len(args)>1 is checked, so passing fewer than four
# arguments raises IndexError.
args = sys.argv
if len(args)>1:
    material = args[1]
    size = int(args[2])
    charstop = int(args[3])
    hu = args[4]
# Split index for the training part (trainportion is defined outside this excerpt).
cut = int(size*trainportion)

print "Material:", material
print "Size:", size, "entries,", trainportion, "as training"

print "Starting Time:",datetime.datetime.now()

# Prepare li: shuffled list of lines, truncated to `size` entries
print "Reading from files..."
li = [line for line in util.file_to_lines(glob.glob(material))]
random.shuffle(li)
li = li[:size]

# Prepare data: list of x(char), y(label) sequences
print "Prepare list of sequences..."

# First `cut` lines are used for training.
closetestdata = li[:cut]

# Pair every x with its y for each training line. The print statements above
# imply Python 2, where zip() returns a list.
traindata = []
for line in closetestdata:
    x, y = util.line_toseq(line, charstop)
    traindata.append(zip(x,y))

# traindata shape: [[(x,y),(x,y), ...],[],[],...]
# testdata shape: [([x1, x2, ...],[y1,y2,...]),([],[])]
Exemplo n.º 10
0
# Command-line override: argv[1] is the glob pattern, argv[2] the sample size
# and argv[3] charstop (both converted to int); argv[4] is stored unconverted
# in hu (its meaning is not visible in this excerpt).
# NOTE(review): only len(args)>1 is checked, so passing fewer than four
# arguments raises IndexError.
args = sys.argv
if len(args)>1:
    material = args[1]
    size = int(args[2])
    charstop = int(args[3])
    hu = args[4]
# Split index for the training part (trainportion is defined outside this excerpt).
cut = int(size*trainportion)

# NOTE(review): these calls print space-separated values under Python 3;
# under Python 2 the multi-argument ones would print a tuple -- confirm the
# interpreter this script targets.
print ("Material:", material)
print ("Size:", size, "entries,", trainportion, "as training")

print ("Starting Time:",datetime.datetime.now())

# Prepare li: shuffled list of lines, truncated to `size` entries
print ("Reading from files...")
li = [line for line in util.file_to_lines(glob.glob(material))]
random.shuffle(li)
li = li[:size]

# Prepare data: list of x(char), y(label) sequences
print ("Prepare list of sequences...")

# First `cut` lines are used for training.
closetestdata = li[:cut]

traindata = []

# Pair every x with its y for each training line.
# NOTE(review): under Python 3 zip() returns a one-shot iterator, so each
# traindata entry can be consumed only once -- confirm that is sufficient.
for line in closetestdata:
    x, y = util.line_toseq(line, charstop)
    traindata.append(zip(x,y))

# traindata shape: [[(x,y),(x,y), ...],[],[],...]