def make_mentions_from_file(mens_file, verbose=False):
    """Read a mentions file (one mention per line) into Mention objects.

    Args:
        mens_file: Path to a text file; each line (after stripping the
            file's surrounding whitespace) is parsed by the project-level
            ``Mention`` constructor.
        verbose: If True, print the wall-clock time spent loading.

    Returns:
        list of Mention — empty when the file has no non-whitespace content.
    """
    stime = time.time()
    with open(mens_file, 'r') as f:
        content = f.read().strip()
    # Guard against an empty/whitespace-only file: without this,
    # "".split("\n") yields [""] and a bogus Mention("") is created.
    mention_lines = content.split("\n") if content else []
    mentions = [Mention(line) for line in mention_lines]
    ttime = (time.time() - stime)
    if verbose:
        print(" ## Time in loading {} mens : {} secs".format(mens_file, ttime))
    return mentions
def new_ta(self, ta):
    """Point the reader at a new TextAnnotation and reset iteration state.

    Processes the annotation into tokenized sentences plus NER
    constituents, converts those to mention lines, and rebuilds the
    mention list. Resets the mention cursor and epoch counter to zero.
    """
    self.textanno = ta
    sent_toks, ner_cons = self.processTestDoc(ta)
    self.mention_lines = self.convertSent2NerToMentionLines(
        sent_toks, ner_cons)
    # Rebuild the full mention list from scratch for the new document.
    self.mentions = [Mention(line) for line in self.mention_lines]
    self.men_idx = 0
    self.num_mens = len(self.mentions)
    self.epochs = 0
def new_test_file(self, test_mens_file):
    """Load a TextAnnotation from a JSON file and reset the reader state.

    Args:
        test_mens_file: Path to a file containing a serialized
            TextAnnotation in JSON form.

    Side effects:
        Stores the path in ``self.test_mens_file`` and resets all
        mention-iteration state (same effect as ``new_ta``).
    """
    self.test_mens_file = test_mens_file
    with open(test_mens_file, 'r') as f:
        tajsonstr = f.read()
    ta = TextAnnotation(json_str=tajsonstr)
    # Delegate to new_ta: it performs the identical state reset
    # (textanno, mention_lines, mentions, men_idx, num_mens, epochs),
    # removing the previously duplicated logic.
    self.new_ta(ta)
def __init__(self, config, vocabloader, test_mens_file,
             num_cands, batch_size, strict_context=True,
             pretrain_wordembed=True, coherence=True):
    """Inference-time test-data reader.

    Loads word/label/entity vocabularies, the coherence-string vocab,
    the pruned crosswikis dictionary and (optionally) Glove vectors,
    then preprocesses the test mentions file into Mention objects
    ready for batched iteration.

    Args:
        config: Project config object; must provide the pickle paths
            ``cohstringG9_vocab_pkl`` and ``crosswikis_pruned_pkl``.
        vocabloader: Project loader exposing getGloveWordVocab(),
            getLabelVocab(), getKnwnWidVocab(), getWID2Wikititle()
            and loadGloveVectors().
        test_mens_file: Path to the test mentions file.
        num_cands: Number of entity candidates per mention (stored;
            used elsewhere — not visible in this chunk).
        batch_size: Number of mentions per batch.
        strict_context: Stored flag; semantics not visible here.
        pretrain_wordembed: If True, load pretrained Glove vectors
            (slow — timed and reported below).
        coherence: Stored flag; presumably gates use of the coherence
            string vocabulary downstream — TODO confirm.
    """
    # Remote CogComp NLP pipeline used for annotating raw text.
    self.pipeline = remote_pipeline.RemotePipeline(
        server_api='http://austen.cs.illinois.edu:5800')
    self.typeOfReader = "inference"
    # start_word / end_word are module-level globals — assumed to be
    # sentence-boundary tokens; defined outside this chunk.
    self.start_word = start_word
    self.end_word = end_word
    self.unk_word = 'unk'  # In tune with word2vec
    self.unk_wid = "<unk_wid>"
    self.tr_sup = 'tr_sup'
    self.tr_unsup = 'tr_unsup'
    self.pretrain_wordembed = pretrain_wordembed
    self.coherence = coherence

    # Word Vocab
    (self.word2idx, self.idx2word) = vocabloader.getGloveWordVocab()
    self.num_words = len(self.idx2word)

    # Label Vocab
    (self.label2idx, self.idx2label) = vocabloader.getLabelVocab()
    self.num_labels = len(self.idx2label)

    # Known WID Vocab (entities the model was trained on)
    (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
    self.num_knwn_entities = len(self.idx2knwid)

    # Wid2Wikititle Map
    self.wid2WikiTitle = vocabloader.getWID2Wikititle()

    # Coherence String Vocab
    print("Loading Coherence Strings Dicts ... ")
    (self.cohG92idx, self.idx2cohG9) = utils.load(
        config.cohstringG9_vocab_pkl)
    self.num_cohstr = len(self.idx2cohG9)

    # Crosswikis: mention-string -> candidate-entity priors (pruned)
    print("Loading Crosswikis dict. (takes ~2 mins to load)")
    self.crosswikis = utils.load(config.crosswikis_pruned_pkl)
    print("Crosswikis loaded. Size: {}".format(len(self.crosswikis)))

    if self.pretrain_wordembed:
        stime = time.time()
        self.word2vec = vocabloader.loadGloveVectors()
        print("[#] Glove Vectors loaded!")
        ttime = (time.time() - stime) / float(60)
        print("[#] Time to load vectors : {} mins".format(ttime))

    print("[#] Test Mentions File : {}".format(test_mens_file))
    print("[#] Loading test file and preprocessing ... ")
    # NOTE(review): unlike new_ta/new_test_file, the return value of
    # processTestDoc is discarded here and convertSent2NerToMentionLines
    # is called with no arguments — presumably these methods also cache
    # their results on self; verify against their definitions.
    self.processTestDoc(test_mens_file)
    self.mention_lines = self.convertSent2NerToMentionLines()
    self.mentions = []
    for line in self.mention_lines:
        m = Mention(line)
        self.mentions.append(m)
    self.men_idx = 0
    self.num_mens = len(self.mentions)
    self.epochs = 0
    print("[#] Test Mentions : {}".format(self.num_mens))

    self.batch_size = batch_size
    print("[#] Batch Size: %d" % self.batch_size)
    self.num_cands = num_cands
    self.strict_context = strict_context

    print("\n[#]LOADING COMPLETE")