def updateTrValCandDict(self, trValCandDict_pkl, crosswikis_pkl,
                            knwn_wid_vocab_pkl, *args):
        if not os.path.exists(trValCandDict_pkl):
            print("Train/Val CWiki Candidates Dict doesn't exist")
            sys.exit()

        print("Updating TrValKwnCandDict for : ")

        print("Loading trvalCandsDict ... ")
        candsDict = utils.load(trValCandDict_pkl)
        print("TrValCandDictSize : {}".format(len(candsDict)))
        self.crosswikis_dict = utils.load_crosswikis(crosswikis_pkl)
        print("Loading known wid2idx dict")
        (self.knwid2idx, self.idx2knwid) = utils.load(knwn_wid_vocab_pkl)
        print("Adding candidates for additional mentions")

        datasetsToUpdate = args
        for dataset in datasetsToUpdate:
            test_file = dataset
            print(test_file)
            mentions = utils.make_mentions_from_file(mens_file=test_file)
            self._addCandidatesForAdditionalMentions(mentions, candsDict)
            print("Size now : {}".format(len(candsDict)))

        utils.save(trValCandDict_pkl, candsDict)
        print("TrValCandDictSize : {}".format(len(candsDict)))
Example #2
    def __init__(self, config, vocabloader, test_mentions_file):
        ''' Updates a test crosswikis dict, i.e. the original crosswikis pruned
        to only the surfaces that appear in the test data.

        Two dictionaries are maintained:
        test_kwn_cwiki : only has candidates that are in the KnownEntity set
        test_all_cwiki : all entities from the KB can be candidates (i.e. full cwikis)
        '''

        if not os.path.exists(config.test_kwnen_cwikis_pkl):
            print("Test Known Entity CWiki does not exist ... ")
            self.test_kwn_cwiki = {}
        else:
            self.test_kwn_cwiki = utils.load(config.test_kwnen_cwikis_pkl)
        print("Size of test known en cwiki : {}".format(
            len(self.test_kwn_cwiki)))

        if not os.path.exists(config.test_allen_cwikis_pkl):
            print("Test Data All Entity CWiki does not exist ... ")
            self.test_all_cwiki = {}
        else:
            self.test_all_cwiki = utils.load(config.test_allen_cwikis_pkl)
        print("Size of test all en cwiki : {}".format(len(
            self.test_all_cwiki)))

        # Known WID Vocab
        print("[#] Loading Known Entities Vocab : ")
        (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
        self.num_knwn_entities = len(self.idx2knwid)
        print(" [#] Loaded. Num of known wids : {}".format(
            self.num_knwn_entities))

        self.crosswikis_dict = utils.load_crosswikis(config.crosswikis_pkl)
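A minimal sketch of how the two test crosswikis slices described in the docstring above could be built, assuming crosswikis maps surface -> [(wid, prob), ...] and each mention exposes a `surface` attribute; the helper name and signature are hypothetical, not the repository's code:

def build_test_cwiki_slices(test_mentions, crosswikis, knwid2idx):
    # test_kwn_cwiki keeps only candidates inside the known-entity set;
    # test_all_cwiki keeps every KB entity returned by crosswikis.
    test_kwn_cwiki = {}
    test_all_cwiki = {}
    for mention in test_mentions:
        surface = mention.surface
        cands = crosswikis.get(surface, [])
        test_all_cwiki[surface] = cands
        test_kwn_cwiki[surface] = [(wid, prob) for (wid, prob) in cands
                                   if wid in knwid2idx]
    return test_kwn_cwiki, test_all_cwiki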
Example #3
    def __init__(self, test_mentions_file, word_vocab_pkl, label_vocab_pkl,
                 knwn_wid_vocab_pkl):
        self.start_word = start_word
        self.end_word = end_word
        self.unk_word = 'unk'  # In tune with word2vec
        self.unk_wid = "<unk_wid>"
        self.tr_sup = 'tr_sup'
        self.tr_unsup = 'tr_unsup'

        if not (os.path.exists(word_vocab_pkl)
                and os.path.exists(label_vocab_pkl)
                and os.path.exists(knwn_wid_vocab_pkl)):
            print(
                "Atleast one vocab not found. Run vocabs.py before running model."
            )
            sys.exit()

        # Word VOCAB
        print("[#] Loading word vocab ... ")
        (self.word2idx, self.idx2word) = utils.load(word_vocab_pkl)
        self.num_words = len(self.idx2word)
        print(" [#] Word vocab loaded. Size of vocab : {}".format(
            self.num_words))

        # Label Vocab
        print("[#] Loading label vocab ... ")
        (self.label2idx, self.idx2label) = utils.load(label_vocab_pkl)
        self.num_labels = len(self.idx2label)
        print(" [#] Label vocab loaded. Number of labels : {}".format(
            self.num_labels))

        # Known WID Vocab
        print("[#] Loading Known Entities Vocab : ")
        (self.knwid2idx, self.idx2knwid) = utils.load(knwn_wid_vocab_pkl)
        self.num_knwn_entities = len(self.idx2knwid)
        print(" [#] Loaded. Num of known wids : {}".format(
            self.num_knwn_entities))

        # Crosswikis
        #print("[#] Loading training/val crosswikis dictionary ... ")
        #self.crosswikis_dict = utils.load_crosswikis(trval_crosswikis_pkl)

        print("[#] Test Mentions File : {}".format(test_mentions_file))

        print("[#] Loading test mentions ... ")
        self.test_mentions = self._make_mentions_from_file(test_mentions_file)
        self.num_test_mentions = len(self.test_mentions)
        print("[#] Test Mentions : {}".format(self.num_test_mentions))

        print("\n[#] LOADING COMPLETE")
Example #4
 def getTrainValCandidateDict(self):
     if self.trval_cands_dict is None:
         if not os.path.exists(self.config.trval_kwnidx_cands_pkl):
             print("Train Validation Candidate Dict missing")
             sys.exit()
         self.trval_cands_dict = utils.load(self.config.trval_kwnidx_cands_pkl)
     return self.trval_cands_dict
Example #5
 def getLabelVocab(self):
     if self.label2idx is None or self.idx2label is None:
         if not os.path.exists(self.config.label_vocab_pkl):
             print("Label Vocab PKL missing")
             sys.exit()
         (self.label2idx, self.idx2label) = utils.load(self.config.label_vocab_pkl)
     return (self.label2idx, self.idx2label)
Example #6
 def getTestAllEnCwiki(self):
     if self.test_allen_cwikis is None:
         if not os.path.exists(self.config.test_allen_cwikis_pkl):
             print("Test All Entity CWikis Dict missing")
             sys.exit()
         self.test_allen_cwikis = utils.load(self.config.test_allen_cwikis_pkl)
     return self.test_allen_cwikis
Example #7
 def loadCrosswikis(self):
     if self.crosswikis is None:
         if not os.path.exists(self.config.crosswikis_pkl):
             print("Crosswikis pkl missing")
             sys.exit()
         self.crosswikis = utils.load(self.config.crosswikis_pkl)
     return self.crosswikis
Example #8
 def getWID2TypeLabels(self):
     if self.wid2TypeLabels is None:
         if not os.path.exists(self.config.wid2typelabels_vocab_pkl):
             print("wid2TypeLabels pkl missing")
             sys.exit()
         self.wid2TypeLabels = utils.load(self.config.wid2typelabels_vocab_pkl)
     return self.wid2TypeLabels
Example #9
 def loadKnownWIDDescVecs(self):
     if self.knownwid2descvecs is None:
         if not os.path.exists(self.config.knownwid2descvectors):
             print("Known WIDS Description Vectors PKL missing")
             sys.exit()
         self.knownwid2descvecs = utils.load(self.config.knownwid2descvectors)
     return self.knownwid2descvecs
Example #10
 def getWordVocab(self):
     if self.word2idx is None or self.idx2word is None:
         if not os.path.exists(self.config.word_vocab_pkl):
             print("Word Vocab PKL missing")
             sys.exit()
         (self.word2idx, self.idx2word) = utils.load(self.config.word_vocab_pkl)
     return (self.word2idx, self.idx2word)
Example #11
 def loadGloveNumpy(self):
     if self.glovenumpy is None:
         if not os.path.exists(self.config.glove_numpy_pkl):
             print("Glove_Numpy doesnot exist")
             sys.exit()
         self.glovenumpy = utils.load(self.config.glove_numpy_pkl)
     return self.glovenumpy
Example #12
 def loadGloveVectors(self):
     if self.glove2vec is None:
         if not os.path.exists(self.config.glove_pkl):
             print("Glove_Vectors_PKL doesnot exist")
             sys.exit()
         self.glove2vec = utils.load(self.config.glove_pkl)
     return self.glove2vec
Example #13
 def getWID2Wikititle(self):
     if self.wid2Wikititle is None:
         if not os.path.exists(self.config.widWiktitle_pkl):
             print("wid2Wikititle pkl missing")
             sys.exit()
         self.wid2Wikititle = utils.load(self.config.widWiktitle_pkl)
     return self.wid2Wikititle
Example #14
 def getKnwnWidVocab(self):
     if self.knwid2idx is None or self.idx2knwid is None:
         if not os.path.exists(self.config.kwnwid_vocab_pkl):
             print("Known Entities Vocab PKL missing")
             sys.exit()
         (self.knwid2idx, self.idx2knwid) = utils.load(self.config.kwnwid_vocab_pkl)
     return (self.knwid2idx, self.idx2knwid)
Example #15
 def getTestCandidateDict(self):
     if self.test_kwnen_cands_dict is None:
         if not os.path.exists(self.config.test_kwnen_cands_pkl):
             print("Train Validation Candidate Dict missing")
             sys.exit()
         self.test_kwnen_cands_dict = utils.load(
             self.config.test_kwnen_cands_pkl)
     return self.test_kwnen_cands_dict
Example #16
 def getCrosswikisSlice(self):
     if self.cwikis_slice is None:
         if not os.path.exists(self.config.crosswikis_slice):
             print("CWikis Slice Dict missing")
             sys.exit()
         print("Loading CWIKI Slice")
         self.cwikis_slice = utils.load(self.config.crosswikis_slice)
     return self.cwikis_slice
Example #17
 def loadPrunedCrosswikis(self):
     if self.crosswikis_pruned is None:
         if not os.path.exists(self.config.crosswikis_pruned_pkl):
             print("Crosswikis Pruned Does not exist.")
             sys.exit()
         self.crosswikis_pruned = utils.load(
             self.config.crosswikis_pruned_pkl)
     return self.crosswikis_pruned
Example #18
 def getGloveWordVocab(self):
     if self.gword2idx is None or self.gidx2word is None:
         if not os.path.exists(self.config.glove_word_vocab_pkl):
             print("Glove Word Vocab PKL missing")
             sys.exit()
         print("Loading Glove Word Vocabulary")
         (self.gword2idx,
          self.gidx2word) = utils.load(self.config.glove_word_vocab_pkl)
     return (self.gword2idx, self.gidx2word)
Example #19
 def getTestKnwEnCwiki(self):
     if self.test_knwen_cwikis is None:
         if not os.path.exists(self.config.test_kwnen_cwikis_pkl):
             print("Test Known Entity CWikis Dict missing")
             sys.exit()
         print("Loading Test Data Known Entity CWIKI")
         self.test_knwen_cwikis = utils.load(
             self.config.test_kwnen_cwikis_pkl)
     return self.test_knwen_cwikis
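The getters above all repeat the same lazy-load pattern: return the cached attribute if already set, otherwise check that the pickle exists, load it with utils.load, and cache it. A hedged sketch of a generic helper that captures this pattern (the name _lazy_load and its arguments are hypothetical and would live on the same loader class; this is not part of the repository):

def _lazy_load(self, attr_name, pkl_path, description):
    # Return the cached attribute if present, otherwise load it from pkl_path.
    if getattr(self, attr_name, None) is None:
        if not os.path.exists(pkl_path):
            print("{} missing".format(description))
            sys.exit()
        setattr(self, attr_name, utils.load(pkl_path))
    return getattr(self, attr_name)

For example, loadCrosswikis() would reduce to
    self._lazy_load('crosswikis', self.config.crosswikis_pkl, 'Crosswikis pkl')

Example #20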
    def __init__(self, config, vocabloader):
        self.tr_mens_dir = config.train_mentions_dir
        self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)

        print("Loading Coherence String Dicts")
        (coh2idx, idx2coh) = utils.load(config.cohstring_vocab_pkl)
        (cohG92idx, idx2cohG9) = utils.load(config.cohstringG9_vocab_pkl)

        print("Coherence Stirng set Size : {}, cnt >= 10 size : {}".format(
            len(idx2coh), len(idx2cohG9)))

        self.testDataCountCohLessMens(config.val_mentions_file, cohG92idx)
        self.testDataCountCohLessMens(config.test_mentions_file, cohG92idx)

        self.testDataCountCohLessMens(config.ace_mentions_file, cohG92idx)

        self.testDataCountCohLessMens(config.aida_inkb_dev_file, cohG92idx)
        self.testDataCountCohLessMens(config.aida_inkb_test_file, cohG92idx)
        self.testDataCountCohLessMens(config.aida_inkb_train_file, cohG92idx)
Example #21
    def __init__(self, config, vocabloader):

        self.new_knw_wid_vocab = "/save/ngupta19/wikipedia/wiki_mentions/wcoh/vocab/new/new_knwn_wid_vocab.pkl"

        (self.knwid2idx, self.idx2knwid) = utils.load(self.new_knw_wid_vocab)

        newfile = "/save/ngupta19/wikipedia/wiki_mentions/wcoh/newmentions.txt"
        self.new_mentions = utils.make_mentions_from_file(newfile)

        self.coldWIDS = set()
Example #22
    def __init__(self, config, vocabloader):
        print("Loading Crosswikis")
        # self.crosswikis = vocabloader.loadCrosswikis()

        stime = time.time()
        self.crosswikis = utils.load(config.crosswikis_pruned_pkl)
        ttime = time.time() - stime
        print("Crosswikis Loaded. Size : {}".format(len(self.crosswikis)))
        print("Time taken : {} secs".format(ttime))

        (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
        print("Size of known wids : {}".format(len(self.knwid2idx)))
Example #23
 def __init__(self, figermodel):
     print(
         "######   ENTERED THE COLD WORLD OF THE UNKNOWN    ##############")
     # Object of the WikiELModel Class
     self.fm = figermodel
     self.coldDir = self.fm.reader.coldDir
     coldWid2DescVecs_pkl = os.path.join(self.coldDir,
                                         "coldwid2descvecs.pkl")
     self.coldWid2DescVecs = utils.load(coldWid2DescVecs_pkl)
     self.num_cold_entities = self.fm.reader.num_cold_entities
     self.batch_size = self.fm.batch_size
     (self.coldwid2idx, self.idx2coldwid) = (self.fm.reader.coldwid2idx,
                                             self.fm.reader.idx2coldwid)
Example #24
    def __init__(self,
                 config,
                 vocabloader,
                 val_file,
                 num_cands,
                 batch_size,
                 strict_context=True,
                 pretrain_wordembed=True,
                 wordDropoutKeep=1.0,
                 cohDropoutKeep=1.0):
        '''
        Reader intended for training data, but it can also be used for test data
        passed in as the validation and test file inputs. The requirement is that
        the mention candidates have already been added to the TrValCandidateDict
        using readers.train.crosswikis_vocab.

        DataType 0/1 corresponds to train/val_file.
        '''
        self.config = config
        self.batch_size = batch_size
        print("[#] Initializing Training Reader Batch Size: {}".format(
            self.batch_size))
        stime = time.time()
        self.start_word = start_word
        self.end_word = end_word
        self.unk_word = 'unk'  # In tune with glove
        self.unk_wid = "<unk_wid>"
        self.pretrain_wordembed = pretrain_wordembed
        assert 0.0 < wordDropoutKeep <= 1.0
        self.wordDropoutKeep = wordDropoutKeep
        assert 0.0 < cohDropoutKeep <= 1.0
        self.cohDropoutKeep = cohDropoutKeep
        self.num_cands = num_cands
        self.strict_context = strict_context

        # Coherence String Vocab
        (self.cohG92idx,
         self.idx2cohG9) = utils.load(config.cohstringG9_vocab_pkl)
        self.num_cohstr = len(self.idx2cohG9)
        print("[#] Coherence Loaded. Num Coherence Strings: {}".format(
            self.num_cohstr))

        self.tr_mens_dir = config.train_mentions_dir
        self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
        self.num_tr_mens_files = len(self.tr_mens_files)
        print("[#] Training Mention Files : {} files".format(
            self.num_tr_mens_files))

        etime = time.time()
        ttime = etime - stime
        print("[#] TRAINING READER LOADING COMPLETE. "
              "Time Taken: {} secs\n".format(ttime))
    def __init__(self, config, vocabloader):
        ''' Used to make pruned crosswikis dict and candidate dictionary
        for training and validation data

        train_val_cwikis_pkl : slice of crosswikis for surfaces in train/val (NOT USED)

        train_val_cwikis_cands_pkl : Train/Val data only contain known entities.
        This dict acts as a pre-cache of mention candidates.
        Key   : (LNRM(surface), WID)
        Value : ([Candidate_IDXs], [CProbs])
        Candidate_IDXs : the first idx is the true wid_idx, the rest are candidates,
        padded with Unk_Wid_Idx (=0) if fewer than the number of candidates needed.
        '''
        self.config = config
        train_mentions_dir = config.train_mentions_dir
        val_mentions_file = config.val_mentions_file
        test_mentions_file = config.test_mentions_file

        tr_mens_files = utils.get_mention_files(train_mentions_dir)
        self.numc = 30
        (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
        self.wid2WikiTitle = vocabloader.getWID2Wikititle()

        if not os.path.exists(config.trval_kwnidx_cands_pkl):
            self.crosswikis_dict = vocabloader.loadPrunedCrosswikis()
            print("Crosswikis Loaded. Size: {}".format(
                len(self.crosswikis_dict)))

            print("Size of known entities: {}".format(len(self.knwid2idx)))
            print("Making Train/Validation/Test CWiki Candidates.\n"
                  "{Key:(surface, wid), V: ([CandWids], [PriorProbs])")
            train_val_candidates_dict = self.make_train_val_candidatesDict(
                train_mentions_dir, tr_mens_files, val_mentions_file,
                test_mentions_file)
            utils.save(config.trval_kwnidx_cands_pkl,
                       train_val_candidates_dict)
            print("Train/Val Candidates Dict Saved")
            sys.exit(0)
        else:
            print("Train/Val CWiki Candidates already exists")
            trval_cand_dict = utils.load(config.trval_kwnidx_cands_pkl)
            print("Loaded dict")
            key = ('barackobama', '534366')
            (candidates, probs) = (trval_cand_dict[key][0],
                                   trval_cand_dict[key][1])
            candidates = [self.idx2knwid[wididx] for wididx in candidates]
            candidates = [self.wid2WikiTitle[wid] for wid in candidates]

            print((key, candidates, probs))
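A hedged sketch of how one entry of the candidates dict described in the docstring above could be assembled (the helper and its arguments are hypothetical; it only illustrates the key/value layout, not the repository's exact code): the true entity's idx goes first, crosswikis candidates that are known entities follow, and the lists are padded with the unknown-wid idx (0) up to numc candidates.

def make_candidate_entry(lnrm_surface, true_wid, crosswikis_dict,
                         knwid2idx, numc, unk_wid_idx=0):
    # Key: (LNRM(surface), WID); Value: ([Candidate_IDXs], [CProbs]).
    cand_idxs = [knwid2idx[true_wid]]
    cprobs = [0.0]  # prior of the true entity, filled in below if found
    for (wid, prob) in crosswikis_dict.get(lnrm_surface, []):
        if len(cand_idxs) == numc:
            break
        if wid == true_wid:
            cprobs[0] = prob
        elif wid in knwid2idx:
            cand_idxs.append(knwid2idx[wid])
            cprobs.append(prob)
    while len(cand_idxs) < numc:
        cand_idxs.append(unk_wid_idx)
        cprobs.append(0.0)
    return (lnrm_surface, true_wid), (cand_idxs, cprobs)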
Example #26
    def __init__(self, config, vocabloader, test_mens_file,
                 num_cands, batch_size, strict_context=True,
                 pretrain_wordembed=True, coherence=True,
                 glove=True):
        print("Loading Test Reader: {}".format(test_mens_file))
        self.typeOfReader="test"
        self.start_word = start_word
        self.end_word = end_word
        self.unk_word = 'unk'  # In tune with word2vec
        self.unk_wid = "<unk_wid>"
        # self.useKnownEntitesOnly = True
        self.pretrain_wordembed = pretrain_wordembed
        self.coherence = coherence

        # Word Vocab
        (self.word2idx, self.idx2word) = vocabloader.getGloveWordVocab()
        self.num_words = len(self.idx2word)
        print(" [#] Word vocab loaded. Size of vocab : {}".format(
            self.num_words))

        # Label Vocab
        (self.label2idx, self.idx2label) = vocabloader.getLabelVocab()
        self.num_labels = len(self.idx2label)
        print(" [#] Label vocab loaded. Number of labels : {}".format(
            self.num_labels))

        # Known WID Vocab
        (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
        self.num_knwn_entities = len(self.idx2knwid)
        print(" [#] Loaded. Num of known wids : {}".format(
            self.num_knwn_entities))

        # Wid2Wikititle Map
        self.wid2WikiTitle = vocabloader.getWID2Wikititle()
        print(" [#] Size of Wid2Wikititle: {}".format(len(
            self.wid2WikiTitle)))

        # # Wid2TypeLabels Map
        # self.wid2TypeLabels = vocabloader.getWID2TypeLabels()
        # print(" [#] Total number of Wids : {}".format(len(
        #     self.wid2TypeLabels)))

        # Coherence String Vocab
        print("Loading Coherence Strings Dicts ... ")
        (self.cohG92idx, self.idx2cohG9) = utils.load(
            config.cohstringG9_vocab_pkl)
        self.num_cohstr = len(self.idx2cohG9)
        print(" [#] Number of Coherence Strings in Vocab : {}".format(
            self.num_cohstr))

        # Known WID Description Vectors
        # self.kwnwid2descvecs = vocabloader.loadKnownWIDDescVecs()
        # print(" [#] Size of kwn wid desc vecs dict : {}".format(
        #     len(self.kwnwid2descvecs)))

        # # Crosswikis
        # print("[#] Loading training/val crosswikis dictionary ... ")
        # self.test_kwnen_cwikis = vocabloader.getTestKnwEnCwiki()
        # self.test_allen_cwikis = vocabloader.getTestAllEnCwiki()

        # Crosswikis
        print("Loading Crosswikis dict. (takes ~2 mins to load)")
        self.crosswikis = utils.load(config.crosswikis_pruned_pkl)
        # self.crosswikis = {}
        print("Crosswikis loaded. Size: {}".format(len(self.crosswikis)))

        if self.pretrain_wordembed:
            stime = time.time()
            self.word2vec = vocabloader.loadGloveVectors()
            print("[#] Glove Vectors loaded!")
            ttime = (time.time() - stime)/float(60)
            print("[#] Time to load vectors : {} mins".format(ttime))

        print("[#] Test Mentions File : {}".format(test_mens_file))

        print("[#] Pre-loading test mentions ... ")
        self.mentions = utils.make_mentions_from_file(test_mens_file)
        self.men_idx = 0
        self.num_mens = len(self.mentions)
        self.epochs = 0
        print( "[#] Test Mentions : {}".format(self.num_mens))

        self.batch_size = batch_size
        print("[#] Batch Size: %d" % self.batch_size)
        self.num_cands = num_cands
        self.strict_context = strict_context

        print("\n[#]LOADING COMPLETE")
Example #27
    def __init__(self,
                 config,
                 vocabloader,
                 test_mens_file,
                 num_cands,
                 batch_size,
                 strict_context=True,
                 pretrain_wordembed=True,
                 coherence=True):
        self.pipeline = remote_pipeline.RemotePipeline(
            server_api='http://austen.cs.illinois.edu:5800')
        self.typeOfReader = "inference"
        self.start_word = start_word
        self.end_word = end_word
        self.unk_word = 'unk'  # In tune with word2vec
        self.unk_wid = "<unk_wid>"
        self.tr_sup = 'tr_sup'
        self.tr_unsup = 'tr_unsup'
        self.pretrain_wordembed = pretrain_wordembed
        self.coherence = coherence

        # Word Vocab
        (self.word2idx, self.idx2word) = vocabloader.getGloveWordVocab()
        self.num_words = len(self.idx2word)

        # Label Vocab
        (self.label2idx, self.idx2label) = vocabloader.getLabelVocab()
        self.num_labels = len(self.idx2label)

        # Known WID Vocab
        (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
        self.num_knwn_entities = len(self.idx2knwid)

        # Wid2Wikititle Map
        self.wid2WikiTitle = vocabloader.getWID2Wikititle()

        # Coherence String Vocab
        print("Loading Coherence Strings Dicts ... ")
        (self.cohG92idx,
         self.idx2cohG9) = utils.load(config.cohstringG9_vocab_pkl)
        self.num_cohstr = len(self.idx2cohG9)

        # Crosswikis
        print("Loading Crosswikis dict. (takes ~2 mins to load)")
        self.crosswikis = utils.load(config.crosswikis_pruned_pkl)
        print("Crosswikis loaded. Size: {}".format(len(self.crosswikis)))

        if self.pretrain_wordembed:
            stime = time.time()
            self.word2vec = vocabloader.loadGloveVectors()
            print("[#] Glove Vectors loaded!")
            ttime = (time.time() - stime) / float(60)
            print("[#] Time to load vectors : {} mins".format(ttime))

        print("[#] Test Mentions File : {}".format(test_mens_file))

        print("[#] Loading test file and preprocessing ... ")
        self.processTestDoc(test_mens_file)
        self.mention_lines = self.convertSent2NerToMentionLines()
        self.mentions = []
        for line in self.mention_lines:
            m = Mention(line)
            self.mentions.append(m)

        self.men_idx = 0
        self.num_mens = len(self.mentions)
        self.epochs = 0
        print("[#] Test Mentions : {}".format(self.num_mens))

        self.batch_size = batch_size
        print("[#] Batch Size: %d" % self.batch_size)
        self.num_cands = num_cands
        self.strict_context = strict_context

        print("\n[#]LOADING COMPLETE")
Example #28
    def __init__(self,
                 config,
                 widWikititle_file,
                 widLabel_file,
                 word_threshold=1):
        '''Given training data, makes word vocab, glove word vocab,
           doc_mentions vocab, type lables vocab, known_wid vocab,
           wid2Wikititle
        '''
        self.start_word = start_word
        self.end_word = end_word
        self.unk_word = 'unk'  # In tune with word2vec
        self.unk_wid = "<unk_wid>"

        self.tr_mens_dir = config.train_mentions_dir
        self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
        self.num_tr_mens_files = len(self.tr_mens_files)
        print("[#] Training Mention Files : {} files".format(
            self.num_tr_mens_files))

        print("[#] Validation Mentions File : {}".format(
            config.val_mentions_file))

        tr_data_vocabs_exist = self.check_train_data_vocabs_exist(
            config.word_vocab_pkl, config.label_vocab_pkl,
            config.kwnwid_vocab_pkl, config.cohstring_vocab_pkl,
            config.cohstringG1_vocab_pkl)

        if not tr_data_vocabs_exist:
            print("[#] Loading pretrained word2vec embeddings .. ")
            self.word2vec = gensim.models.Word2Vec.load_word2vec_format(
                config.word2vec_bin_gz, binary=True)
            self.word2vec.init_sims(replace=True)

            print("All/Some Training Vocabs do not exist. Making ... ")
            self.make_training_data_vocabs(
                self.tr_mens_dir, self.tr_mens_files, config.word_vocab_pkl,
                config.label_vocab_pkl, config.kwnwid_vocab_pkl,
                config.cohstring_vocab_pkl, config.cohstringG1_vocab_pkl,
                config.cohstringG9_vocab_pkl, word_threshold)

        if not os.path.exists(config.widWiktitle_pkl):
            print(" [#] Making wid2Wikititle Map")
            wid2Wikititle = self.make_widWikititleMap(widWikititle_file)
            utils.save(config.widWiktitle_pkl, wid2Wikititle)
            print(" [#] Done. Size : {}".format(len(wid2Wikititle)))

        if not os.path.exists(config.wid2typelabels_vocab_pkl):
            print(" [#] Making wid2Types Map")
            wid2types = self.make_wid2TypesMap(widLabel_file)
            utils.save(config.wid2typelabels_vocab_pkl, wid2types)
            print(" [#] Done. Size : {}".format(len(wid2types)))

        if not os.path.exists(config.glove_word_vocab_pkl):
            print(" [#] Makign GloVe Word Vocabs")
            glove2vec = utils.load(config.glove_pkl)
            print("   [#] Glove embeddings loaded. Size: {}".format(
                len(glove2vec)))
            (glove_word2idx,
             glove_idx2word) = self.make_glovewordvocab(glove2vec)
            utils.save(config.glove_word_vocab_pkl,
                       (glove_word2idx, glove_idx2word))
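A minimal sketch of the kind of frequency-thresholded word vocabulary the constructor above builds (the helper name and the exact tokenization are assumptions, not the repository's code): tokens seen fewer than word_threshold times map to the unk word.

from collections import Counter

def build_word_vocab(token_lists, threshold, unk_word='unk'):
    # Count tokens over all training mentions and keep the frequent ones;
    # everything below the threshold falls back to the unk word at idx 0.
    counts = Counter(tok for tokens in token_lists for tok in tokens)
    idx2word = [unk_word] + sorted(w for w, c in counts.items()
                                   if c >= threshold and w != unk_word)
    word2idx = {w: i for i, w in enumerate(idx2word)}
    return word2idx, idx2word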
Example #29
    def __init__(self,
                 config,
                 vocabloader,
                 num_cands,
                 batch_size,
                 strict_context=True,
                 pretrain_wordembed=True,
                 coherence=True):
        self.typeOfReader = "inference"
        self.start_word = start_word
        self.end_word = end_word
        self.unk_word = 'unk'  # In tune with word2vec
        self.unk_wid = "<unk_wid>"
        self.tr_sup = 'tr_sup'
        self.tr_unsup = 'tr_unsup'
        self.pretrain_wordembed = pretrain_wordembed
        self.coherence = coherence

        # Word Vocab
        (self.word2idx, self.idx2word) = vocabloader.getGloveWordVocab()
        self.num_words = len(self.idx2word)

        # Label Vocab
        (self.label2idx, self.idx2label) = vocabloader.getLabelVocab()
        self.num_labels = len(self.idx2label)

        # Known WID Vocab
        (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
        self.num_knwn_entities = len(self.idx2knwid)

        # Wid2Wikititle Map
        self.wid2WikiTitle = vocabloader.getWID2Wikititle()

        # Coherence String Vocab
        print("Loading Coherence Strings Dicts ... ")
        (self.cohG92idx,
         self.idx2cohG9) = utils.load(config.cohstringG9_vocab_pkl)
        self.num_cohstr = len(self.idx2cohG9)

        # Crosswikis
        print("Loading Crosswikis dict. (takes ~2 mins to load)")
        self.crosswikis = utils.load(config.crosswikis_pruned_pkl)
        print("Crosswikis loaded. Size: {}".format(len(self.crosswikis)))

        if self.pretrain_wordembed:
            stime = time.time()
            self.word2vec = vocabloader.loadGloveVectors()
            print("[#] Glove Vectors loaded!")
            ttime = (time.time() - stime) / float(60)

        # print("[#] Test Mentions File : {}".format(test_mens_file))

        # print("[#] Loading test file and preprocessing ... ")
        # with open(test_mens_file, 'r') as f:
        #     tajsonstr = f.read()
        # ta = TextAnnotation(json_str=tajsonstr)
        #
        # (sentences_tokenized, modified_ner_cons_list) = self.processTestDoc(ta)
        #
        # self.mention_lines = self.convertSent2NerToMentionLines(
        #     sentences_tokenized, modified_ner_cons_list)
        #
        # self.mentions = []
        # for line in self.mention_lines:
        #     m = Mention(line)
        #     self.mentions.append(m)

        self.men_idx = 0
        # self.num_mens = len(self.mentions)
        self.epochs = 0
        # print( "[#] Test Mentions : {}".format(self.num_mens))

        self.batch_size = batch_size
        print("[#] Batch Size: %d" % self.batch_size)
        self.num_cands = num_cands
        self.strict_context = strict_context

        print("\n[#]LOADING COMPLETE")
Example #30
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    output_file = "data/output.json"  #sys.argv[2]

    range_start = 0  #int(sys.argv[3])
    range_end = 10  #int(sys.argv[4])

    file_name = "data/qanta.train.2018.04.18.json"  #sys.argv[1]
    question_list = json.loads(open(file_name).read())["questions"]
    sentences = question_list[range_start:min(range_end, len(question_list))]

    FLAGS_check(FLAGS)

    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    print("Loading in variables!")
    word2idx, idx2word = vocabloader.getGloveWordVocab()
    wid2WikiTitle = vocabloader.getWID2Wikititle()
    crosswikis = utils.load(config.crosswikis_pruned_pkl)
    word2vec = vocabloader.loadGloveVectors()
    print("DONE LOADING IN VARIABLES!!!")

    all_entities = []

    for sent in sentences:
        tf.reset_default_graph()
        loc = config.test_file.replace(
            "sampletest.txt", "{}_{}.txt".format(range_start, range_end))
        w = open(loc, "w")
        config.test_file = loc
        sent["text"] = decrypt(sent["text"].replace("\xa0", " "))
        w.write(sent["text"].encode("ascii", "ignore").decode("ascii"))
        print(sent["text"].encode("ascii", "ignore").decode("ascii"))
        w.close()
        FLAGS.dropout_keep_prob = 1.0
        FLAGS.wordDropoutKeep = 1.0
        FLAGS.cohDropoutKeep = 1.0
        start = time.time()
        print("Test file {} ".format(config.test_file))
        reader = InferenceReader(config=config,
                                 vocabloader=vocabloader,
                                 test_mens_file=config.test_file,
                                 num_cands=FLAGS.num_cand_entities,
                                 batch_size=FLAGS.batch_size,
                                 word2idx=word2idx,
                                 idx2word=idx2word,
                                 wid2WikiTitle=wid2WikiTitle,
                                 crosswikis=crosswikis,
                                 word2vec=word2vec,
                                 strict_context=FLAGS.strict_context,
                                 pretrain_wordembed=FLAGS.pretrain_wordembed,
                                 coherence=FLAGS.coherence)
        print("Took {} time to create inference reader".format(time.time() -
                                                               start))
        docta = reader.ccgdoc
        model_mode = 'inference'

        config_proto = tf.ConfigProto()
        config_proto.allow_soft_placement = True
        config_proto.gpu_options.allow_growth = True
        sess = tf.Session(config=config_proto)

        print("COHSTR", reader.num_cohstr)
        """with sess.as_default():

            start = time.time()
            model = ELModel(
                sess=sess, reader=reader, dataset=FLAGS.dataset,
                max_steps=FLAGS.max_steps,
                pretrain_max_steps=FLAGS.pretraining_steps,
                word_embed_dim=FLAGS.word_embed_dim,
                context_encoded_dim=FLAGS.context_encoded_dim,
                context_encoder_num_layers=FLAGS.context_encoder_num_layers,
                context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
                coherence_numlayers=FLAGS.coherence_numlayers,
                jointff_numlayers=FLAGS.jointff_numlayers,
                learning_rate=FLAGS.learning_rate,
                dropout_keep_prob=FLAGS.dropout_keep_prob,
                reg_constant=FLAGS.reg_constant,
                checkpoint_dir=FLAGS.checkpoint_dir,
                optimizer=FLAGS.optimizer,
                mode=model_mode,
                strict=FLAGS.strict_context,
                pretrain_word_embed=FLAGS.pretrain_wordembed,
                typing=FLAGS.typing,
                el=FLAGS.el,
                coherence=FLAGS.coherence,
                textcontext=FLAGS.textcontext,
                useCNN=FLAGS.useCNN,
                WDLength=FLAGS.WDLength,
                Fsize=FLAGS.Fsize,
                entyping=FLAGS.entyping)

            print("Loading EL Model took {} time".format(time.time()-start))

            print("Doing inference")

            try:
                start = time.time()
                (predTypScNPmat_list,
                widIdxs_list,
                priorProbs_list,
                textProbs_list,
                jointProbs_list,
                evWTs_list,
                pred_TypeSetsList) = model.inference(ckptpath=FLAGS.model_path)
                print("Inference took {} time".format(time.time()-start))
            except:
                entity_list = {'qanta_id':sent['qanta_id'],'mentions':[]}
                all_entities.append(entity_list)
                print("No entities")
                continue
 
            start = time.time()
            numMentionsInference = len(widIdxs_list)
            numMentionsReader = 0
            for sent_idx in reader.sentidx2ners:
                numMentionsReader += len(reader.sentidx2ners[sent_idx])
            assert numMentionsInference == numMentionsReader

            mentionnum = 0
            entityTitleList = []
            print("Tokenized sentences {}".format(reader.sentences_tokenized))
            for sent_idx in reader.sentidx2ners:
                nerDicts = reader.sentidx2ners[sent_idx]
                sentence = ' '.join(reader.sentences_tokenized[sent_idx])
                for s, ner in nerDicts:
                    [evWTs, evWIDS, evProbs] = evWTs_list[mentionnum]
                    predTypes = pred_TypeSetsList[mentionnum]

                    entityTitleList.append(evWTs[2])
                    mentionnum += 1

            elview = copy.deepcopy(docta.view_dictionary['NER_CONLL'])
            elview.view_name = 'ENG_NEURAL_EL'
            for i, cons in enumerate(elview.cons_list):
                cons['label'] = entityTitleList[i]

            docta.view_dictionary['ENG_NEURAL_EL'] = elview

            print("Processing took {} time".format(time.time()-start))

            print("List of entities")
            #print(elview.cons_list)
            print("\n")
            
            s = sent["text"]
            print("New S is {}".format(s))
            e = elview.cons_list
            t = reader.sentences_tokenized 
            c = []
            f = []

            print(s)
            #print("E {}".format(e))
            print("T {}".format(t))

            for i in t:
                for j in i:
                    f.append(j)
            i = 0
            token_pointer = 0
            while token_pointer < len(f) and i < len(s):
                token_len = len(f[token_pointer])
                while i+token_len<len(s) and s[i:i+token_len] != f[token_pointer]:
                    i+=1
                c.append((i,token_len+i))
                i+=1
                token_pointer+=1
            if len(c) != len(f):
                print("ERROR in C and F")           
            unflattened_c = []
            c_pointer = 0
            for i in range(len(t)):
                l = c[c_pointer:c_pointer+len(t[i])]
                c_pointer+=len(t[i])
                unflattened_c.append(l)

            #print("C {}".format(c))
            #print("F {}".format(f))
            #print("Unflattened C {}".format(unflattened_c)) 

            entity_list = {'qanta_id':sent['qanta_id'],'mentions':[]}
            sentence_num = 0
               
            UNK = "<unk_wid>"
            for i in range(len(e)):
                if e[i]["label"]!=UNK:
                    all_words = False
                    while not all_words and sentence_num < len(t):
                        all_words = True
                        #print(e[i])
                        for word in range(e[i]["start"],e[i]["end"]+1):
                            if len(t[sentence_num])<=word or t[sentence_num][word] not in e[i]["tokens"]:
                                all_words = False
                        if not all_words:
                            sentence_num+=1
                    if sentence_num == len(t):
                        print("Error with sentence_num")
                    else:
                        entity_list['mentions'].append({'entity':e[i]["label"],'span':[unflattened_c[sentence_num][e[i]['start']][0],unflattened_c[sentence_num][e[i]['end']][1]]})
            #print("Entity list is {}".format(entity_list))

            all_entities.append(entity_list)
            local_vars = list(locals().items())
            del reader
     
            del predTypScNPmat_list
            del widIdxs_list
            del priorProbs_list
            del textProbs_list
            del jointProbs_list
            del evWTs_list
            del model
            del pred_TypeSetsList
            print("Memory usage {}".format(getCurrentMemoryUsage()))
            #print("All entities are {}".format(all_entities))
        del sess"""
        gc.collect()
        tf.reset_default_graph()

    w = open(output_file, "w")
    w.write(json.dumps(all_entities))
    w.close()

    print("Dumped JSON, all done")
    print("Took {} time".format(time.time() - prog_start))
    return
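The commented-out block in main() aligns each tokenized sentence back to character offsets in the original question text so that mention spans can be emitted. A compact, hedged re-statement of that alignment as a standalone helper (not the repository's code); it assumes tokens appear verbatim in the text, left to right:

def align_tokens_to_spans(text, tokens):
    # For each token, find its (start, end) character span in text, scanning
    # left to right so repeated tokens map to successive occurrences.
    spans = []
    i = 0
    for tok in tokens:
        j = text.find(tok, i)
        if j == -1:      # token not found verbatim; skip it
            continue
        spans.append((j, j + len(tok)))
        i = j + len(tok)
    return spans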