import glob
import logging
import re

import numpy as np

# Project-local modules. The CooccurMatrix import path is an assumption; the
# class is used below but its home module is not named in this file.
import cleansing
import triplet_vocabulary
import vocabulary
from cooccur_matrix import CooccurMatrix


def learn_triplets_cooccur_mat(triplets_file_path):
    """Build an NP1 x VP co-occurrence count matrix over all triplet files.

    VP and NP2 words share one vocabulary (np2.voc), matching
    generate_triplets_voc below, which funnels NP2 words into the vp side.
    """
    files = glob.glob(triplets_file_path)
    np_voc = vocabulary.Vocabulary()
    vp_voc = vocabulary.Vocabulary()
    np_voc.load('../mat/np1.voc')
    vp_voc.load('../mat/np2.voc')
    num_np = np_voc.size()
    num_vp = vp_voc.size()
    cooccur_mat = np.zeros([num_np, num_vp])
    for file_in in files:
        with open(file_in, 'r') as f:
            for line in f:
                # Skip markup lines; strip the trailing two characters
                # (assumes '\r\n' line endings) and lowercase.
                if line[0] != '<':
                    line = line[:-2].lower()
                    triplets = line.split('|')
                    np1 = cleansing.clean(triplets[0].split())
                    vp = cleansing.clean(triplets[1].split())
                    np2 = cleansing.clean(triplets[2].split())
                    # NP2 words are counted together with the VP words.
                    vp.extend(np2)
                    # Keep only in-vocabulary words.
                    np1_new = [w for w in np1 if np_voc.contain(w)]
                    vp_new = [w for w in vp if vp_voc.contain(w)]
                    # Count every (NP1 word, VP/NP2 word) pair in this line.
                    for u in np1_new:
                        for v in vp_new:
                            cooccur_mat[np_voc.get_word_index(u),
                                        vp_voc.get_word_index(v)] += 1
    return cooccur_mat
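# A minimal usage sketch for the builder above. The glob pattern is a
# hypothetical stand-in; the '../mat/*.voc' files must already exist, since
# the function loads them itself.
def demo_learn_triplets_cooccur_mat():
    mat = learn_triplets_cooccur_mat('../triplets/*.txt')  # hypothetical path
    print(mat.shape, mat.sum())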
def learn_story_histogram(file_in, words_voc, word_type='ALL', ocr_file=None):
    """Build a normalized word histogram for one story.

    word_type is 'NP1', 'VP', 'NP2', or 'ALL' and selects which triplet
    fields contribute words. Returns (hist, document, ocr_words).
    """
    words_voc_num = words_voc.size()
    hist = np.zeros([1, words_voc_num])
    document = []
    with open(file_in, 'r') as f:
        for line in f:
            if line[0] != '<':
                line = line[:-2].lower()
                words = []
                triplets = line.split('|')
                if word_type in ('NP1', 'ALL'):
                    words.extend(cleansing.clean(triplets[0].split()))
                if word_type in ('VP', 'ALL'):
                    words.extend(cleansing.clean(triplets[1].split()))
                if word_type in ('NP2', 'ALL'):
                    words.extend(cleansing.clean(triplets[2].split()))
                words_new = [w for w in words if words_voc.contain(w)]
                document.extend(words_new)
                for w in words_new:
                    hist[0, words_voc.get_word_index(w)] += 1

    # Read the OCR file, if provided, and fold its words into the histogram.
    # TODO: weight OCR words (the original plan was to count each OCR word
    # twice; they are currently counted once, like triplet words).
    ocr_words = []
    if ocr_file is not None:
        with open(ocr_file, 'r') as f:
            for line in f:
                if line[0] != '<':
                    line = line[:-2].lower()
                    ocr_words.extend(cleansing.clean(line.split()))
        ocr_words_new = [w for w in ocr_words if words_voc.contain(w)]
        document.extend(ocr_words_new)
        for w in ocr_words_new:
            hist[0, words_voc.get_word_index(w)] += 1

    # Normalize to a probability distribution (unless the story is empty).
    sum_hist = hist.sum()
    if sum_hist != 0:
        hist = hist / sum_hist
        try:
            assert 0.9 < hist.sum() < 1.1
        except AssertionError:
            print(hist)
            print(hist.sum())
            raise
    return (hist, document, ocr_words)
def learn_story_histogram(file_in, words_voc, word_type="ALL", ocr_file=None): """word_type = 'NP1', 'VP', 'NP2', 'ALL'""" words_voc_num = words_voc.size() hist = np.zeros([1, words_voc_num]) document = [] with open(file_in, "r") as f: for line in f: if line[0] != "<": line = (line[:-2]).lower() words = [] triplets = line.split("|") if word_type == "NP1" or word_type == "ALL": words.extend(cleansing.clean(triplets[0].split())) if word_type == "VP" or word_type == "ALL": words.extend(cleansing.clean(triplets[1].split())) if word_type == "NP2" or word_type == "ALL": words.extend(cleansing.clean(triplets[2].split())) words_new = [w for w in words if words_voc.contain(w)] document.extend(words_new) for w in words_new: hist[0, words_voc.get_word_index(w)] += 1 # Read OCR file and combine into histogram if provided. ocr_words = [] if ocr_file is not None: # TODO: # read ocr file # combine OCR into histogram # each word count 2 # fill ocr_words with open(ocr_file, "r") as f: for line in f: if line[0] != "<": line = (line[:-2]).lower() ocr_words.extend(cleansing.clean(line.split())) ocr_words_new = [w for w in ocr_words if words_voc.contain(w)] document.extend(ocr_words_new) for w in ocr_words_new: hist[0, words_voc.get_word_index(w)] += 1 # Normalize. sum_hist = hist.sum() if sum_hist != 0: hist = hist / sum_hist try: assert hist.sum() > 0.9 and hist.sum() < 1.1 except AssertionError: print (hist) print (hist.sum()) raise return (hist, document, ocr_words)
def learn_triplets_cooccur_mat(file_in, co_mat_file):
    """Restrict a learned NP1 similarity matrix to one story's NP1 words.

    Despite sharing a name with the count-matrix builder above (which it
    shadows when both live in one module), this returns a word-similarity
    submatrix over the NP1 words that actually occur in file_in.
    """
    learned_co_mat = CooccurMatrix()
    learned_co_mat.load(co_mat_file)
    np_voc = learned_co_mat.vocabulary
    np1_matrix = learned_co_mat.matrix

    # Collect this story's NP1 words that the loaded vocabulary knows.
    np1_all = vocabulary.Vocabulary()
    with open(file_in, 'r') as f:
        for line in f:
            if line[0] != '<':
                line = line[:-2].lower()
                triplets = line.split('|')
                np1 = cleansing.clean(triplets[0].split())
                # Delete words not in the similarity vocabulary.
                np1_new = [w for w in np1 if np_voc.contain(w)]
                for w in np1_new:
                    np1_all.add(w)

    # Copy the relevant rows and columns out of the full matrix.
    num_np1 = np1_all.size()
    similarity_mat_np1 = np.zeros([num_np1, num_np1])
    for i in range(num_np1):
        for j in range(num_np1):
            similarity_mat_np1[i, j] = np1_matrix[
                np_voc.get_word_index(np1_all.get_word(i)),
                np_voc.get_word_index(np1_all.get_word(j))]
    return CooccurMatrix(similarity_mat_np1, np1_all)
def ocr_find_story(ocr_results, ocr_time, stories):
    """Match an OCR frame to a story by time overlap and word overlap.

    Returns the story index, or -1 when no story matches well enough.
    """
    # Pull minutes and seconds out of the timestamp (digits 11-12 and 13-14,
    # presumably a YYYYMMDDHHMMSS-style string).
    reprog = re.compile(r"""\d\d\d\d\d\d\d\d\d\d(\d\d)(\d\d).+""", re.I | re.X)
    m = reprog.match(ocr_time)
    g = m.groups()
    start_time_minute = int(g[0])
    start_time_second = int(g[1])
    # Search within a +/- 3 minute window, clamped to [0, 59].
    time_range_left_min = max(start_time_minute - 3, 0)
    time_range_right_min = min(start_time_minute + 3, 59)

    # Keep only cleaned OCR words longer than two characters.
    ocr_words = cleansing.clean(ocr_results)
    ocr_words = [w for w in ocr_words if len(w) > 2]
    if len(ocr_words) < 1:
        return -1

    # Take the first time-overlapping story with any word overlap at all;
    # the loop breaks as soon as one is found.
    overlap = 0
    story_id = -1
    for count, story in enumerate(stories):
        if time_overlap(time_range_left_min, time_range_right_min,
                        start_time_second, story):
            p = words_overlap_percentage(ocr_words, story)
            if p > overlap:
                overlap = p
                story_id = count
                break

    # Accept the match only if at least half the OCR words were found.
    if overlap >= 0.5:
        return story_id
    return -1
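# time_overlap and words_overlap_percentage are called above but defined
# elsewhere in the project. A minimal sketch of the word-overlap half, under
# the assumption that a story exposes a word list; the attribute name and
# this whole implementation are hypothetical, not the project's actual helper.
def words_overlap_percentage(ocr_words, story):
    # Fraction of OCR words that also occur in the story's word list.
    story_words = set(story.words)  # hypothetical attribute
    hits = sum(1 for w in ocr_words if w in story_words)
    return hits / float(len(ocr_words))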
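# Usage sketch: restrict a saved NP1 matrix to one story's NP1 words. Both
# paths are hypothetical; .vocabulary and .matrix are the CooccurMatrix
# attributes already used above.
def demo_story_similarity():
    sim = learn_triplets_cooccur_mat('story.txt', '../mat/np1_cooccur')
    print(sim.vocabulary.size(), sim.matrix.shape)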
def generate_triplets_voc(files):
    """Collect NP and VP vocabularies from triplet files.

    Stop words are removed. NP1 words go to the np vocabulary; VP and NP2
    words both go to the vp vocabulary.
    """
    triplet_voc = triplet_vocabulary.TripletVocabulary()
    for file_in in files:
        with open(file_in, 'r') as f:
            for line in f:
                if line[0] != '<':
                    line = line[:-2].lower()
                    triplets = line.split('|')
                    # Remove stop words; NP1 -> np, VP and NP2 -> vp.
                    for w in cleansing.clean(triplets[0].split()):
                        triplet_voc.add_np(w)
                    for w in cleansing.clean(triplets[1].split()):
                        triplet_voc.add_vp(w)
                    for w in cleansing.clean(triplets[2].split()):
                        triplet_voc.add_vp(w)
    return triplet_voc
def build_vocabulary(input_triplet_files, word_type='ALL'):
    """Build a word vocabulary from the selected triplet fields.

    word_type is 'NP1', 'VP', 'NP2', or 'ALL'.
    """
    vocab = vocabulary.Vocabulary()
    for triplet_file in input_triplet_files:
        with open(triplet_file, 'r') as f:
            for line in f:
                if line[0] != '<':
                    line = line[:-2].lower()
                    triplets = line.split('|')
                    words = []
                    if word_type in ('NP1', 'ALL'):
                        words.extend(cleansing.clean(triplets[0].split()))
                    if word_type in ('VP', 'ALL'):
                        words.extend(cleansing.clean(triplets[1].split()))
                    if word_type in ('NP2', 'ALL'):
                        words.extend(cleansing.clean(triplets[2].split()))
                    for w in words:
                        vocab.add(w)
    logging.info('Vocabulary: {0}, {1}.'.format(word_type, vocab.size()))
    return vocab
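# Usage sketch: build a VP-only vocabulary over a hypothetical glob of triplet
# files, then query it with the Vocabulary methods used throughout this file.
def demo_build_vocabulary():
    files = glob.glob('../triplets/*.txt')  # hypothetical path
    vp_voc = build_vocabulary(files, word_type='VP')
    if vp_voc.contain('running'):
        print(vp_voc.get_word_index('running'))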
def add_line(self, line):
    # Method of a class that keeps a running word list in self.line_list;
    # the enclosing class is not shown in this file.
    words = cleansing.clean(line.split())
    for w in words:
        self.line_list.append(w)
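# add_line above needs an enclosing class that owns line_list; that class is
# not shown in this file. A minimal hypothetical container it could belong to:
class LineList(object):
    def __init__(self):
        self.line_list = []

    def add_line(self, line):
        # Clean the line and keep every surviving word.
        for w in cleansing.clean(line.split()):
            self.line_list.append(w)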