    def createDics(self, bin_dataframe, pretrained_embeddings):

        # as_matrix() was removed in recent pandas; use to_numpy() instead
        bin_np = bin_dataframe.to_numpy()

        if not pretrained_embeddings:  # maybe not needed
            # without pretrained vectors, reserve an index for unknown words
            self.word_to_ix["<unk>"] = len(self.word_to_ix)

        # initialize the event dictionary
        self.event_to_ix["non-event"] = len(self.event_to_ix)
        self.event_to_ix["event"] = len(self.event_to_ix)

        # initialize the tag dictionary with the fallback "Other" labels
        self.tag_to_ix["B-Other"] = len(self.tag_to_ix)
        self.tag_to_ix["I-Other"] = len(self.tag_to_ix)

        for line in bin_np:
            if line[1] is not None:
                # token line: extend the word vocabulary unless embeddings are pretrained
                if pretrained_embeddings:
                    continue
                for word in utils.strToLst(line[1]):
                    if word not in self.word_to_ix:
                        self.word_to_ix[word] = len(self.word_to_ix)
            else:
                # metadata line: register the gold tag
                tag = utils.strToLst(line[0])['corrected_tags']
                if tag not in self.tag_to_ix:
                    self.tag_to_ix[tag] = len(self.tag_to_ix)

        self.BIOset, self.ECset = utils.getSortedTagsFromBIO(self.tag_to_ix)
        self.tag_to_ix = utils.getSegmentationDict(self.BIOset)
        self.ec_to_ix = utils.getSegmentationDict(self.ECset)
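
# A minimal, self-contained sketch of the vocabulary-building idea used by
# createDics above: grow a token-to-index dictionary on the fly, reserving an
# "<unk>" entry that absorbs out-of-vocabulary words later on. The names below
# (build_vocab, sample_bins) are illustrative and not part of this repository.
def build_vocab(token_lists):
    word_to_ix = {"<unk>": 0}
    for tokens in token_lists:
        for word in tokens:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    return word_to_ix

sample_bins = [["the", "match", "starts"], ["goal", "in", "the", "match"]]
print(build_vocab(sample_bins))
# {'<unk>': 0, 'the': 1, 'match': 2, 'starts': 3, 'goal': 4, 'in': 5}
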
    def __init__(self, file):
        docNr = -1
        self.head_docs = []
        tokens = headIdDoc("")

        for i in range(file.shape[0]):
            # a '#doc' marker starts a new document; the last row closes the final one
            if '#doc' in file[i][0] or i == file.shape[0] - 1:
                if i == file.shape[0] - 1:  # the last row is still a token line
                    tokens.append(int(file[i][0]), file[i][1], file[i][2],
                                  utils.strToLst(file[i][3]),
                                  utils.strToLst(file[i][4]))
                if docNr != -1:  # flush the document collected so far
                    self.head_docs.append(tokens)
                docNr += 1
                tokens = headIdDoc(file[i][0])
            else:
                # token line: numeric id, two string columns, two serialized list columns
                tokens.append(int(file[i][0]), file[i][1], file[i][2],
                              utils.strToLst(file[i][3]),
                              utils.strToLst(file[i][4]))
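
# A self-contained sketch of the document-splitting loop above, assuming the
# input is a sequence of rows whose first field is either a '#doc ...' marker
# or a token id. The helper and variable names (split_docs, rows) are
# illustrative only; the real class accumulates headIdDoc objects instead.
def split_docs(rows):
    docs, current = [], None
    for row in rows:
        if row[0].startswith('#doc'):
            if current is not None:
                docs.append(current)  # flush the previous document
            current = {"header": row[0], "tokens": []}
        elif current is not None:
            current["tokens"].append(row)
    if current is not None:
        docs.append(current)  # flush the last document
    return docs

rows = [("#doc 1",), ("0", "kickoff"), ("1", "goal"), ("#doc 2",), ("0", "penalty")]
print([len(d["tokens"]) for d in split_docs(rows)])  # prints [2, 1]
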
    def __init__(self, fname):

        config_file = parsers.read_properties(fname)
        # print("\nConfiguration file {} loaded \n".format(fname))
        self.config_fname = fname

        # load data
        self.pretrained_embeddings = utils.strToBool(config_file.getProperty("pretrained_embeddings"))
        self.filename_embeddings = config_file.getProperty("filename_embeddings")
        # print(os.path.basename(self.filename_embeddings))

        name_of_embeddings = ""

        self.embeddings_size = int(config_file.getProperty("embeddings_size"))
        self.word_to_ix = {}

        if self.pretrained_embeddings:
            name_of_embeddings = "_" + os.path.basename(self.filename_embeddings)

            if not os.path.isfile(self.filename_embeddings + ".pkl"):
                # first run: parse the raw embedding file and cache it as a pickle
                self.wordvectors, self.embeddings_size, self.word_to_ix = utils.readWordvectorsNumpy(
                    self.filename_embeddings,
                    isBinary=self.filename_embeddings.endswith(".bin"))
                joblib.dump((self.wordvectors, self.embeddings_size, self.word_to_ix),
                            self.filename_embeddings + ".pkl")
            else:
                # later runs: loading the cached pickle is faster
                self.wordvectors, self.embeddings_size, self.word_to_ix = joblib.load(
                    self.filename_embeddings + ".pkl")



        self.filename_train = config_file.getProperty("filename_train")
        self.filename_dev = config_file.getProperty("filename_dev")
        self.filename_test = config_file.getProperty("filename_test")
        self.pad_length = int(config_file.getProperty("pad_length"))

        # cache each constructed dataset; the cache name encodes the embedding file
        # and the pad length so different configurations do not collide
        train_cache = self.filename_train + name_of_embeddings + "_" + str(self.pad_length) + "_tweet.pkl"
        if not os.path.isfile(train_cache):
            train = reader.BinDataset(self.filename_train, isTrain=True,
                                      pretrained_embeddings=self.pretrained_embeddings,
                                      word_to_ix=self.word_to_ix, pad_length=self.pad_length)
            joblib.dump(train, train_cache)
        else:
            train = joblib.load(train_cache)  # loading is faster

        self.word_to_ix, self.tag_to_ix, self.event_to_ix, self.ec_to_ix = train.getDictionaries()
        

        dev_cache = self.filename_dev + name_of_embeddings + "_" + str(self.pad_length) + "_tweet.pkl"
        if not os.path.isfile(dev_cache):
            dev = reader.BinDataset(self.filename_dev, isTrain=False, word_to_ix=self.word_to_ix,
                                    tag_to_ix=self.tag_to_ix, event_to_ix=self.event_to_ix,
                                    pad_length=self.pad_length)
            joblib.dump(dev, dev_cache)
        else:
            dev = joblib.load(dev_cache)  # loading is faster



        test_cache = self.filename_test + name_of_embeddings + "_" + str(self.pad_length) + "_tweet.pkl"
        if not os.path.isfile(test_cache):
            test = reader.BinDataset(self.filename_test, isTrain=False, word_to_ix=self.word_to_ix,
                                     tag_to_ix=self.tag_to_ix, event_to_ix=self.event_to_ix,
                                     pad_length=self.pad_length)
            joblib.dump(test, test_cache)
        else:
            test = joblib.load(test_cache)  # loading is faster
        

        
        print(train)

        self.train_loader = DataLoader(train, batch_size=1, shuffle=False)
        self.dev_loader = DataLoader(dev, batch_size=1, shuffle=False)
        self.test_loader = DataLoader(test, batch_size=1, shuffle=False)

        print()
        #self.dev_id_docs = parsers.readHeadFile( self.filename_dev)
        #self.test_id_docs = parsers.readHeadFile(self.filename_test)

        # get labels for the whole collection
        #dataset_documents = []
        #dataset_documents.extend(self.train_id_docs)
        #dataset_documents.extend(self.dev_id_docs)
        #dataset_documents.extend(self.test_id_docs)
        #self.dataset_set_characters = utils.getCharsFromDocuments(dataset_documents)
        #self.dataset_set_bio_tags, self.dataset_set_ec_tags = utils.getEntitiesFromDocuments(dataset_documents)
        #self.dataset_set_relations = utils.getRelationsFromDocuments(dataset_documents)
        #print (len(self.dataset_set_characters))
        #print(len(self.dataset_set_bio_tags))

        #print((self.dataset_set_characters))
        # print((self.dataset_set_bio_tags))








        # training
        self.nepochs = int(config_file.getProperty("nepochs"))
        self.optimizer = config_file.getProperty("optimizer")
        #self.activation =config_file.getProperty("activation")
        self.learning_rate = float(config_file.getProperty("learning_rate"))
        #self.nepoch_no_imprv = int(config_file.getProperty("nepoch_no_imprv"))
        self.use_dropout = utils.strToBool(config_file.getProperty("use_dropout"))
        self.use_BIO_LSTM = utils.strToBool(config_file.getProperty("use_BIO_LSTM"))
        self.ner_loss = config_file.getProperty("ner_loss")
        self.ner_classes = config_file.getProperty("ner_classes")
        self.bin_features = config_file.getProperty("bin_features").lower()
        self.tweet_representation = config_file.getProperty("tweet_representation").lower()
        self.non_linearity_bin_features = config_file.getProperty("non_linearity_bin_features").lower()
        try:
            self.threshold = float(config_file.getProperty("threshold"))
        except Exception:
            # the threshold property is optional; default to 0 if missing or unparsable
            self.threshold = 0


        # hyperparameters
        self.n_filters        = int(config_file.getProperty("n_filters"))
        self.filter_sizes     = utils.strToLst(config_file.getProperty("filter_sizes"))
        self.batch_norm       = utils.strToBool(config_file.getProperty("batch_norm"))
        self.cnn_pool         = config_file.getProperty("cnn_pool").lower()
        self.dropout_cnn      = float(config_file.getProperty("dropout_cnn"))
        self.bin_representation = config_file.getProperty("bin_representation").lower()
        self.dropout_lstm1_output = float(config_file.getProperty("dropout_lstm1_output"))




        self.dropout_embedding = float(config_file.getProperty("dropout_embedding"))
        #self.dropout_lstm = float(config_file.getProperty("dropout_lstm"))
        self.dropout_lstm2_output = float(config_file.getProperty("dropout_lstm2_output"))
        self.dropout_fcl_ner = float(config_file.getProperty("dropout_fcl_ner"))
        self.dropout_fcl_rel = float(config_file.getProperty("dropout_fcl_rel"))
        #self.hidden_size_lstm =int(config_file.getProperty("hidden_size_lstm"))
        self.hidden_dim = int(config_file.getProperty("hidden_dim"))
        #self.hidden_size_n2 = config_file.getProperty("hidden_size_n2")
        self.num_lstm_layers = int(config_file.getProperty("num_lstm_layers"))
        #self.char_embeddings_size = int(config_file.getProperty("char_embeddings_size"))
        #self.hidden_size_char = int(config_file.getProperty("hidden_size_char"))
        #self.label_embeddings_size = int(config_file.getProperty("label_embeddings_size"))
        #self.alpha = float(config_file.getProperty("alpha"))

        # evaluation
        self.evaluation_method = config_file.getProperty("evaluation_method")
        #self.root_node=bool(config_file.getProperty("root_node"))

        self.shuffle = False
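
# A minimal, self-contained sketch of the joblib caching pattern used in the
# constructor above: build an expensive object once, dump it to a pickle next
# to its source file, and reload that pickle on later runs. The cache name and
# build function below are illustrative, not part of this repository.
import os
import joblib

def load_or_build(cache_path, build_fn):
    if os.path.isfile(cache_path):
        return joblib.load(cache_path)  # loading the cached pickle is faster
    obj = build_fn()
    joblib.dump(obj, cache_path)
    return obj

vocab = load_or_build("vocab_demo.pkl", lambda: {"<unk>": 0, "goal": 1})
print(vocab)
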
    def preprocess(self, bin_dataframe):
        # as_matrix() was removed in recent pandas; use to_numpy() instead
        bin_np = bin_dataframe.to_numpy()
        docNr = -1

        bin_tweets = []

        bin_tweet_lengths=[]
        bin_tweets_text=[]

        previous_match = ""

        match = []
        for i in range(bin_np.shape[0]):

            if bin_np[i][1] is None or i == bin_np.shape[0] - 1:  # a metadata row closes a bin; the last row closes the final one
                if i == bin_np.shape[0] - 1:  # the last row is still a tweet line, so encode it before flushing
                    tweet_text = utils.lstToString(utils.strToLst(bin_np[i][1])).split()
                    tweet, tweet_length = utils.prepare_sequence(
                        tweet_text, self.word_to_ix,
                        pad_length=self.pad_length)
                    bin_tweets.append(tweet)
                    bin_tweet_lengths.append(tweet_length)
                    bin_tweets_text.append(tweet_text)


                if docNr != -1:
                    # bin_tweets = np.asarray(bin_tweets)

                    try:
                        tag_id = self.tag_to_ix[target]
                        if target.startswith("B-") or target.startswith("I-"):
                            ec_id = self.ec_to_ix[target[2:]]  # entity class without the BIO prefix
                        else:
                            ec_id = self.ec_to_ix[target]
                    except KeyError:
                        # unseen tag: fall back to the generic "Other" entries
                        if target.startswith("B-"):
                            tag_id = self.tag_to_ix["B-Other"]
                        elif target.startswith("I-"):
                            tag_id = self.tag_to_ix["I-Other"]
                        ec_id = self.ec_to_ix["Other"]

                    # map the tag and the event id onto the binary event/non-event dictionary
                    if target == "O":
                        event_duration_idx = self.event_to_ix["non-event"]
                    else:
                        event_duration_idx = self.event_to_ix["event"]
                    if event_id == -1:
                        independent_event_idx = self.event_to_ix["non-event"]
                    else:
                        independent_event_idx = self.event_to_ix["event"]

                    #print (len(bin_tweets))
                    #print (torch.stack(bin_tweets))
                    match.append([torch.stack(bin_tweets), tag_id, ec_id, event_duration_idx,
                                  independent_event_idx, event_type, event_id, bin_tweet_lengths])

                    #print (utils.getDictionaryKeyByIdx(self.tag_to_ix,tag_id),utils.getDictionaryKeyByIdx(self.ec_to_ix,ec_id),utils.getDictionaryKeyByIdx(self.event_to_ix,event_id))

                    # match=np.append(match,bin_tokens)
                    # match['match_bins'].append(bin)

                docNr += 1
                if i != bin_np.shape[0] - 1:
                    infoDict = utils.strToLst(bin_np[i][0])
                    # print('infoDict', infoDict)

                    if previous_match != infoDict['doc']:
                        # print (infoDict['doc'])

                        # match = {'match_bins': np.empty((0)),"match_name": infoDict['doc']}
                        previous_match = infoDict['doc']

                        # the empty match list is appended first and then filled in place
                        # through the shared reference as the bins below are processed
                        match = []
                        self.matches.append(match)

                    bin_tweets = []
                    bin_tweet_lengths = []
                    bin_tweets_text = []
                    target = infoDict['corrected_tags']
                    event_type = infoDict['event_type']
                    event_id = infoDict['event_id']
                    match_name = infoDict['doc']


                    # {'bin': infoDict['bin'],'targets': infoDict['corrected_tags'],'tweets':[],'timestamps':[],'tokens':""}
            else:

                # bin['tweets'].append(strToLst(bin_np[i][1]))
                # bin_tokens+=" "+lstToString(strToLst(bin_np[i][1]))
                # bin['timestamps'].append(int(bin_np[i][0]))
                # print ((lstToString(strToLst(bin_np[i][1])).split()))
                #print (bin_tokens)
                # tweet line: map the tokens to indices and pad to a fixed length
                tweet_text = utils.lstToString(utils.strToLst(bin_np[i][1])).split()
                tweet, tweet_length = utils.prepare_sequence(tweet_text, self.word_to_ix,
                                                             pad_length=self.pad_length)

                bin_tweets.append(tweet)
                bin_tweet_lengths.append(tweet_length)
                bin_tweets_text.append(tweet_text)
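
# A self-contained sketch of the index-and-pad step that utils.prepare_sequence
# is assumed to perform above: look each token up in word_to_ix (falling back
# to "<unk>"), truncate or pad to pad_length, and return the padded tensor
# together with the original length. The real helper's behaviour may differ.
import torch

def prepare_sequence_sketch(tokens, word_to_ix, pad_length):
    ids = [word_to_ix.get(tok, word_to_ix["<unk>"]) for tok in tokens[:pad_length]]
    length = len(ids)
    ids += [0] * (pad_length - length)  # pad with index 0 (an assumption)
    return torch.tensor(ids, dtype=torch.long), length

word_to_ix = {"<unk>": 0, "goal": 1, "scored": 2}
seq, length = prepare_sequence_sketch(["goal", "scored", "late"], word_to_ix, pad_length=5)
print(seq, length)  # tensor([1, 2, 0, 0, 0]) 3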