示例#1
0
    def __init__(self,mode = "None", use = "TweetData", combine_embeddings=False, BATCHSIZE=64.0):
        """Store the configuration, read the corpus and load the embeddings.

        mode -- forwarded to ``findEntity.corpus2BIO``.  The value
            "withIntermediate" selects the tag set with B-/I- labels; any
            other value selects the I- only tag set.
        use -- embedding source.  The values matched here are
            "10000Kalimat", "TweetData" and "All"; for any other value no
            embedding attribute is set.
        combine_embeddings -- when it compares equal to True, ``word2vec``,
            ``tag2vec`` and ``dataPOSTag`` are restored (files named
            ``*_50Dimension`` / ``tagFeed_*``); otherwise only ``word2vec``
            is restored (files named ``*_100Dimension``).
        BATCHSIZE -- stored unchanged on the instance.
        """
        # Plain configuration values.
        self.mode, self.use = mode, use
        self.combine_embeddings = combine_embeddings
        self.BATCHSIZE = BATCHSIZE
        self.EMBEDDING_DIM, self.HIDDEN_DIM = 100, 50

        # Project helpers, built in this order, then the BIO-converted corpus.
        self.nerData = prepareData()
        self.evaluation = evaluate()
        self.text = findEntity()
        corpus = self.text.corpus2BIO(mode=self.mode)
        self.data, self.tags = corpus

        # Tag vocabulary: entity labels first, then the two sentinel tags,
        # numbered consecutively from 0.
        if self.mode == "withIntermediate":
            labels = ["None", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]  # Version 2
        else:
            labels = ["None", "I-PER", "I-LOC", "I-ORG"]  # Version 1
        labels += ["<START>", "<STOP>"]
        self.tag_to_ix = {label: index for index, label in enumerate(labels)}

        # One row per supported ``use`` value:
        #   (option, combined plan, word-only plan)
        # where a plan is (log message, ((attribute name, pickle path), ...)).
        # NOTE: the log text for the "10000Kalimat" option reads "1000Kalimat".
        embedding_table = (
            ("10000Kalimat",
             ("loading combinded embeddings, option: 1000Kalimat",
              (("word2vec", "./Embeddings/word2vec_with10000Kalimat_50Dimension.pic"),
               ("tag2vec", "./Embeddings/tag2vec_with10000Kalimat_50Dimension.pic"),
               ("dataPOSTag", "./Embeddings/tagFeed_with10000Kalimat.pic"))),
             ("loading word embeddings, option: 1000Kalimat",
              (("word2vec", "./Embeddings/word2vec_with10000Kalimat_100Dimension.pic"),))),
            ("TweetData",
             ("loading combinded embeddings, option: TweetData",
              (("word2vec", "./Embeddings/word2vec_withTweetData_50Dimension.pic"),
               ("tag2vec", "./Embeddings/tag2vec_withTweetData_50Dimension.pic"),
               ("dataPOSTag", "./Embeddings/tagFeed_withTweetData.pic"))),
             ("loading word embeddings2, option: TweetData",
              (("word2vec", "./Embeddings/word2vec_withTweetData2_100Dimension.pic"),))),
            ("All",
             ("loading combinded embeddings, option: All",
              (("word2vec", "./Embeddings/word2vec_All_50Dimension.pic"),
               ("tag2vec", "./Embeddings/tag2vec_All_50Dimension.pic"),
               ("dataPOSTag", "./Embeddings/tagFeed_All.pic"))),
             ("loading word embeddings, option: All",
              (("word2vec", "./Embeddings/word2vec_All_100Dimension.pic"),))),
        )

        for option, combined_plan, word_only_plan in embedding_table:
            if self.use == option:
                if self.combine_embeddings == True:
                    message, targets = combined_plan
                else:
                    message, targets = word_only_plan
                print(message)
                # Restore each pickle onto the attribute it belongs to.
                for attribute, path in targets:
                    setattr(self, attribute, self.nerData.restore_model(path))
                break
示例#2
0
# elif use=="All":
#     if combine_embeddings==True:
#         print("loading combinded embeddings, option: All")
#         word2vec = nerData.restore_model("./Embeddings/word2vec_All_50Dimension.pic")
#         tag2vec = nerData.restore_model("./Embeddings/tag2vec_All_50Dimension.pic")
#         dataPOSTag = nerData.restore_model("./Embeddings/tagFeed_All.pic")
#     else:
#         print("loading word embeddings, option: All")
#         word2vec = nerData.restore_model("./Embeddings/word2vec_All_100Dimension.pic")

# toFeed = nerData.restore_model("./Embeddings/toFeed_withTweetData.pic")
# tagFeed = nerData.restore_model("./Embeddings/tagFeed_withTweetData.pic")
# Build the token data fed to the embedding trainer: toFeed comes from
# getDefaultData() and is then passed through getTweetData() together with
# MoreTweets.tsv (presumably adding the tweets -- confirm in prepareData).
toFeed = nerData.getDefaultData()
toFeed = nerData.getTweetData(toFeed, filename="MoreTweets.tsv")
# Incorporate the test corpus into the data used to build the embeddings.
testCorpus = findEntity(filename="test.txt")
# Only the first element of the corpus2BIO() result is used here
# (presumably the tokenized sentences -- confirm against findEntity).
tokenizedTest = testCorpus.corpus2BIO()[0]
for tokens in tokenizedTest:
    toFeed.append(tokens)

# testTags is computed from the tokenized test data; the loop that would add
# it to tagFeed is commented out below, so it is not used in the lines here.
testTags = nerData.getTestTag(tokenizedTest)
# for element in testTags:
# tagFeed.append(element)

# Combined mode builds both word and tag vectors; word-only mode builds the
# word vectors with dim=100 passed explicitly.
if combine_embeddings == True:
    word2vec = nerData.getWord2Vec(toFeed)
    # NOTE(review): no live assignment to tagFeed is visible in this part of
    # the file (the nearby ones are commented out) -- confirm it is defined
    # before this branch can run, otherwise this raises NameError.
    tag2vec = nerData.getTag2Vec(tagFeed)
else:
    word2vec = nerData.getWord2Vec(toFeed, dim=100)

# NOTE(review): this rebinds the name entityTagger from the callable to the
# object it returns, so the callable is no longer reachable under this name
# afterwards.
entityTagger = entityTagger()
示例#3
0
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from rujukKateglo import rujukKateglo

# Default: word-only embeddings; the experiment branches may switch this on.
combine_embeddings = False

# External arguments: argv[1] is the test data file name and argv[2] the
# experiment number (converted to int). Neither is validated here, so a
# missing argument raises IndexError and a non-numeric experiment number
# raises ValueError.
testData = sys.argv[1]
experimentNum = int(sys.argv[2])
# verbosity = sys.argv[2]
# Assumes that the data is in the same directory as berkicau.py
directoryTestData = "./"

# Data preparation object
nerData = prepareData()
# Test data: read the corpus named on the command line and keep the first
# element of the corpus2BIO() result (presumably the tokenized sentences --
# confirm against findEntity).
testCorpus = findEntity(dir=directoryTestData, filename=testData)
tokenizedTest = testCorpus.corpus2BIO()[0]

if experimentNum == 1:
    # Experiment 1 uses the combined (word + tag) embeddings built from the
    # TweetData files.
    combine_embeddings = True
    print("Experiment 1: Word2Vec + Tag2Vec with TweetData")
    # Read the list of model names, one per line. The context manager closes
    # the file handle as soon as the content has been read instead of leaving
    # it to the garbage collector.
    # NOTE(review): the file is opened in binary mode while the split pattern
    # is a str; that combination raises TypeError on Python 3 -- confirm the
    # target interpreter.
    with open("./Models/models_TweetData_Combined.txt", "rb") as modelListFile:
        modelFolder = re.split('\n', modelListFile.read())
    toFeed = nerData.restore_model("./Embeddings/toFeed_withTweetData.pic")
    tagFeed = nerData.restore_model("./Embeddings/tagFeed_withTweetData.pic")
    # Incorporate the test corpus into the data used to build the embeddings.
    for tokens in tokenizedTest:
        toFeed.append(tokens)

    if combine_embeddings == True:
示例#4
0
        tag2vec = nerData.restore_model(
            "./Embeddings/tag2vec_All_50Dimension.pic")
        dataPOSTag = nerData.restore_model("./Embeddings/tagFeed_All.pic")
    else:
        print("loading word embeddings, option: All")
        word2vec = nerData.restore_model(
            "./Embeddings/word2vec_All_100Dimension.pic")

# nerData.restore_model("toFeed_with10000Kalimat.pic")
# nerData.restore_model("tagFeed_with10000Kalimat.pic")
# nerData.restore_model("toFeed_withTweetData.pic")
# nerData.restore_model("tagFeed_withTweetData.pic")
# nerData.restore_model("toFeed_All.pic")
# nerData.restore_model("tagFeed_All.pic")

# Read the corpus with findEntity's default arguments and convert it with
# corpus2BIO; the result is unpacked into exactly two values, data and tags.
# `mode` is a module-level name assigned outside the lines shown here.
text = findEntity()
data, tags = text.corpus2BIO(mode=mode)

# Sentinel tag names added to the tag vocabulary alongside the entity labels.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
# Model size settings (presumably the embedding width and the hidden-layer
# width of the tagger -- confirm where these are consumed).
EMBEDDING_DIM = 100
HIDDEN_DIM = 50
if mode == "withIntermediate":
    tag_to_ix = {
        "None": 0,
        "B-PER": 1,
        "I-PER": 2,
        "B-LOC": 3,
        "I-LOC": 4,
        "B-ORG": 5,
        "I-ORG": 6,
示例#5
0
 def __init__(self, dir="./Datasets/", filename=None):
     """Initialise the evaluator and create its ``findEntity`` corpus reader.

     dir -- accepted by the signature but not used by the code in this
         method; it is not forwarded to ``findEntity``.
     filename -- corpus file name passed to ``findEntity``.  When None,
         ``findEntity`` is constructed with its own defaults.
     """
     super(evaluate, self).__init__()
     # Identity comparison: None is a singleton, and ``is`` cannot be
     # redirected by a custom __eq__ on the argument.
     if filename is None:
         self.text = findEntity()
     else:
         self.text = findEntity(filename=filename)