示例#1
0
def make_mentions_from_file(mens_file, verbose=False):
    """Load a mentions file and build one ``Mention`` per line.

    The whole file is read in one go, leading/trailing whitespace is
    stripped, and the remainder is split on newlines. Each resulting piece
    is handed to ``Mention``. Note that a file with no content still yields
    a single empty-string piece, which is passed to ``Mention`` as well.

    Args:
        mens_file: Path of the mentions file to read.
        verbose: When true, print the time spent loading.

    Returns:
        A list of ``Mention`` objects, in file order.
    """
    started = time.time()
    with open(mens_file, 'r') as handle:
        raw_lines = handle.read().strip().split("\n")
        mentions = [Mention(raw) for raw in raw_lines]
    elapsed = time.time() - started
    if verbose:
        print(" ## Time in loading {} mens : {} secs".format(mens_file, elapsed))
    return mentions
示例#2
0
    def new_ta(self, ta):
        """Make ``ta`` the current text annotation and rebuild its mentions.

        Stores ``ta``, runs it through ``processTestDoc`` and
        ``convertSent2NerToMentionLines`` to obtain the mention lines, wraps
        every line in a ``Mention``, and resets the reader's counters.

        Args:
            ta: Text annotation object accepted by ``self.processTestDoc``.
        """
        self.textanno = ta

        tokenized_sents, ner_cons = self.processTestDoc(ta)
        self.mention_lines = self.convertSent2NerToMentionLines(
            tokenized_sents, ner_cons)

        # Grow the list in place, one mention at a time.
        self.mentions = []
        self.mentions.extend(Mention(line) for line in self.mention_lines)

        # Reset position/epoch counters for the newly loaded mentions.
        self.num_mens = len(self.mentions)
        self.men_idx = 0
        self.epochs = 0
    def new_test_file(self, test_mens_file):
        """Load a serialized text annotation from disk and make it current.

        The file's contents are passed as ``json_str`` to ``TextAnnotation``.
        The resulting annotation is then installed through ``new_ta``, which
        stores it, rebuilds the mention lines and mentions, and resets the
        counters.

        Args:
            test_mens_file: Path of the file holding the annotation JSON
                string; also remembered as ``self.test_mens_file``.
        """
        self.test_mens_file = test_mens_file

        with open(test_mens_file, 'r') as f:
            tajsonstr = f.read()

        # Every step after parsing is the same as in new_ta, so delegate
        # instead of maintaining a second copy of that sequence.
        self.new_ta(TextAnnotation(json_str=tajsonstr))
示例#4
0
    def __init__(self,
                 config,
                 vocabloader,
                 test_mens_file,
                 num_cands,
                 batch_size,
                 strict_context=True,
                 pretrain_wordembed=True,
                 coherence=True):
        """Initialise the inference reader and load the test mentions.

        Loads the vocabularies and dictionaries, optionally the pretrained
        word vectors, then preprocesses ``test_mens_file`` into ``Mention``
        objects. Progress is reported on stdout.

        Args:
            config: Object exposing ``cohstringG9_vocab_pkl`` and
                ``crosswikis_pruned_pkl`` (passed to ``utils.load``).
            vocabloader: Loader exposing ``getGloveWordVocab``,
                ``getLabelVocab``, ``getKnwnWidVocab``, ``getWID2Wikititle``
                and ``loadGloveVectors``.
            test_mens_file: Test input handed to ``self.processTestDoc``.
            num_cands: Stored as ``self.num_cands``.
            batch_size: Stored as ``self.batch_size``.
            strict_context: Stored as ``self.strict_context``.
            pretrain_wordembed: When true, word vectors are loaded through
                ``vocabloader.loadGloveVectors``.
            coherence: Stored as ``self.coherence``.
        """
        self.pipeline = remote_pipeline.RemotePipeline(
            server_api='http://austen.cs.illinois.edu:5800')

        # Fixed reader settings and special tokens.
        self.typeOfReader = "inference"
        self.start_word = start_word
        self.end_word = end_word
        self.unk_word = 'unk'  # In tune with word2vec
        self.unk_wid = "<unk_wid>"
        self.tr_sup = 'tr_sup'
        self.tr_unsup = 'tr_unsup'
        self.pretrain_wordembed = pretrain_wordembed
        self.coherence = coherence

        # Word vocabulary.
        self.word2idx, self.idx2word = vocabloader.getGloveWordVocab()
        self.num_words = len(self.idx2word)

        # Label vocabulary.
        self.label2idx, self.idx2label = vocabloader.getLabelVocab()
        self.num_labels = len(self.idx2label)

        # Known-WID vocabulary.
        self.knwid2idx, self.idx2knwid = vocabloader.getKnwnWidVocab()
        self.num_knwn_entities = len(self.idx2knwid)

        # WID -> Wikipedia title map.
        self.wid2WikiTitle = vocabloader.getWID2Wikititle()

        # Coherence string vocabulary (pair of mappings loaded from disk).
        print("Loading Coherence Strings Dicts ... ")
        coh_vocab = utils.load(config.cohstringG9_vocab_pkl)
        self.cohG92idx, self.idx2cohG9 = coh_vocab
        self.num_cohstr = len(self.idx2cohG9)

        # Crosswikis dictionary.
        print("Loading Crosswikis dict. (takes ~2 mins to load)")
        self.crosswikis = utils.load(config.crosswikis_pruned_pkl)
        print("Crosswikis loaded. Size: {}".format(len(self.crosswikis)))

        # Pretrained word vectors are optional; report the load time in minutes.
        if self.pretrain_wordembed:
            load_started = time.time()
            self.word2vec = vocabloader.loadGloveVectors()
            print("[#] Glove Vectors loaded!")
            elapsed_mins = (time.time() - load_started) / float(60)
            print("[#] Time to load vectors : {} mins".format(elapsed_mins))

        print("[#] Test Mentions File : {}".format(test_mens_file))
        print("[#] Loading test file and preprocessing ... ")

        # Preprocess the test document, then wrap each mention line.
        self.processTestDoc(test_mens_file)
        self.mention_lines = self.convertSent2NerToMentionLines()
        self.mentions = []
        self.mentions.extend(Mention(line) for line in self.mention_lines)

        # Position/epoch counters start from zero.
        self.men_idx = 0
        self.num_mens = len(self.mentions)
        self.epochs = 0
        print("[#] Test Mentions : {}".format(self.num_mens))

        self.batch_size = batch_size
        print("[#] Batch Size: %d" % self.batch_size)
        self.num_cands = num_cands
        self.strict_context = strict_context

        print("\n[#]LOADING COMPLETE")