def __init__(self, mode="None", use="TweetData", combine_embeddings=False, BATCHSIZE=64):
    # use = "10000Kalimat", "TweetData" or "All"
    self.mode = mode
    self.use = use
    self.combine_embeddings = combine_embeddings
    self.BATCHSIZE = BATCHSIZE
    self.EMBEDDING_DIM = 100
    self.HIDDEN_DIM = 50
    self.nerData = prepareData()
    self.evaluation = evaluate()
    self.text = findEntity()
    self.data, self.tags = self.text.corpus2BIO(mode=self.mode)
    START_TAG = "<START>"
    STOP_TAG = "<STOP>"
    if self.mode == "withIntermediate":
        # Version 2: full BIO scheme with B-/I- prefixes
        self.tag_to_ix = {"None": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4,
                          "B-ORG": 5, "I-ORG": 6, START_TAG: 7, STOP_TAG: 8}
    else:
        # Version 1: collapsed scheme, only I- tags per entity type
        self.tag_to_ix = {"None": 0, "I-PER": 1, "I-LOC": 2, "I-ORG": 3,
                          START_TAG: 4, STOP_TAG: 5}
    if self.use == "10000Kalimat":
        if self.combine_embeddings:
            print("loading combined embeddings, option: 10000Kalimat")
            self.word2vec = self.nerData.restore_model("./Embeddings/word2vec_with10000Kalimat_50Dimension.pic")
            self.tag2vec = self.nerData.restore_model("./Embeddings/tag2vec_with10000Kalimat_50Dimension.pic")
            self.dataPOSTag = self.nerData.restore_model("./Embeddings/tagFeed_with10000Kalimat.pic")
        else:
            print("loading word embeddings, option: 10000Kalimat")
            self.word2vec = self.nerData.restore_model("./Embeddings/word2vec_with10000Kalimat_100Dimension.pic")
    elif self.use == "TweetData":
        if self.combine_embeddings:
            print("loading combined embeddings, option: TweetData")
            self.word2vec = self.nerData.restore_model("./Embeddings/word2vec_withTweetData_50Dimension.pic")
            self.tag2vec = self.nerData.restore_model("./Embeddings/tag2vec_withTweetData_50Dimension.pic")
            self.dataPOSTag = self.nerData.restore_model("./Embeddings/tagFeed_withTweetData.pic")
        else:
            print("loading word embeddings, option: TweetData")
            self.word2vec = self.nerData.restore_model("./Embeddings/word2vec_withTweetData2_100Dimension.pic")
    elif self.use == "All":
        if self.combine_embeddings:
            print("loading combined embeddings, option: All")
            self.word2vec = self.nerData.restore_model("./Embeddings/word2vec_All_50Dimension.pic")
            self.tag2vec = self.nerData.restore_model("./Embeddings/tag2vec_All_50Dimension.pic")
            self.dataPOSTag = self.nerData.restore_model("./Embeddings/tagFeed_All.pic")
        else:
            print("loading word embeddings, option: All")
            self.word2vec = self.nerData.restore_model("./Embeddings/word2vec_All_100Dimension.pic")
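# Usage sketch (an assumption based only on the constructor signature above,
# not code from the repo): the tagger can presumably be instantiated like this.
#   tagger = entityTagger(mode="withIntermediate", use="TweetData",
#                         combine_embeddings=True)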
# elif use=="All": # if combine_embeddings==True: # print("loading combinded embeddings, option: All") # word2vec = nerData.restore_model("./Embeddings/word2vec_All_50Dimension.pic") # tag2vec = nerData.restore_model("./Embeddings/tag2vec_All_50Dimension.pic") # dataPOSTag = nerData.restore_model("./Embeddings/tagFeed_All.pic") # else: # print("loading word embeddings, option: All") # word2vec = nerData.restore_model("./Embeddings/word2vec_All_100Dimension.pic") # toFeed = nerData.restore_model("./Embeddings/toFeed_withTweetData.pic") # tagFeed = nerData.restore_model("./Embeddings/tagFeed_withTweetData.pic") toFeed = nerData.getDefaultData() toFeed = nerData.getTweetData(toFeed, filename="MoreTweets.tsv") #Incorporate testCorpus to embeddings testCorpus = findEntity(filename="test.txt") tokenizedTest = testCorpus.corpus2BIO()[0] for tokens in tokenizedTest: toFeed.append(tokens) testTags = nerData.getTestTag(tokenizedTest) # for element in testTags: # tagFeed.append(element) if combine_embeddings == True: word2vec = nerData.getWord2Vec(toFeed) tag2vec = nerData.getTag2Vec(tagFeed) else: word2vec = nerData.getWord2Vec(toFeed, dim=100) entityTagger = entityTagger()
import re
import sys

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from rujukKateglo import rujukKateglo

combine_embeddings = False

# External arguments
testData = sys.argv[1]
experimentNum = int(sys.argv[2])
# verbosity = sys.argv[2]

# Assumes that the data is in the same directory as berkicau.py
directoryTestData = "./"

# Data preparation object
nerData = prepareData()

# Test data
testCorpus = findEntity(dir=directoryTestData, filename=testData)
tokenizedTest = testCorpus.corpus2BIO()[0]

if experimentNum == 1:
    combine_embeddings = True
    print("Experiment 1: Word2Vec + Tag2Vec with TweetData")
    modelFolder = re.split('\n', open("./Models/models_TweetData_Combined.txt", "r").read())
    toFeed = nerData.restore_model("./Embeddings/toFeed_withTweetData.pic")
    tagFeed = nerData.restore_model("./Embeddings/tagFeed_withTweetData.pic")
    # Incorporate testCorpus into the embeddings
    for tokens in tokenizedTest:
        toFeed.append(tokens)
    if combine_embeddings:
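# Assumed invocation of this script (the script name berkicau.py comes from the
# comment above; the test filename and experiment number are just examples):
#   python berkicau.py test.txt 1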
    tag2vec = nerData.restore_model("./Embeddings/tag2vec_All_50Dimension.pic")
    dataPOSTag = nerData.restore_model("./Embeddings/tagFeed_All.pic")
else:
    print("loading word embeddings, option: All")
    word2vec = nerData.restore_model("./Embeddings/word2vec_All_100Dimension.pic")

# nerData.restore_model("toFeed_with10000Kalimat.pic")
# nerData.restore_model("tagFeed_with10000Kalimat.pic")
# nerData.restore_model("toFeed_withTweetData.pic")
# nerData.restore_model("tagFeed_withTweetData.pic")
# nerData.restore_model("toFeed_All.pic")
# nerData.restore_model("tagFeed_All.pic")

text = findEntity()
data, tags = text.corpus2BIO(mode=mode)

START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 100
HIDDEN_DIM = 50

if mode == "withIntermediate":
    tag_to_ix = {
        "None": 0,
        "B-PER": 1,
        "I-PER": 2,
        "B-LOC": 3,
        "I-LOC": 4,
        "B-ORG": 5,
        "I-ORG": 6,
        START_TAG: 7,
        STOP_TAG: 8,
    }
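# Sketch of how tag_to_ix is typically consumed downstream (an illustration, not
# the repo's own training code): map a BIO tag sequence to integer indices.
#   example_tags = ["None", "B-PER", "I-PER", "None"]
#   example_ix = [tag_to_ix[t] for t in example_tags]   # -> [0, 1, 2, 0]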
def __init__(self, dir="./Datasets/", filename=None):
    super(evaluate, self).__init__()
    # dir is accepted but not forwarded to findEntity here
    if filename is None:
        self.text = findEntity()
    else:
        self.text = findEntity(filename=filename)
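# Assumed usage (a sketch based only on the constructor above):
#   ev = evaluate(filename="test.txt")   # evaluate against a specific test corpus
#   ev_default = evaluate()              # fall back to the default findEntity corpus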