import codecs
import datetime
import glob
import os
import re

import cleansing


def ocr_find_story(ocr_results, ocr_time, stories):
    """Match a set of OCR words to the story whose time window and vocabulary
    overlap it best; return the story index, or -1 when no match is good enough."""
    # ocr_time is a %Y%m%d%H%M%S timestamp; the groups capture minute and second.
    reprog = re.compile(r"""\d\d\d\d\d\d\d\d\d\d(\d\d)(\d\d).+""", re.I | re.X)
    m = reprog.match(ocr_time)
    g = m.groups()
    start_time_minute = int(g[0])
    start_time_second = int(g[1])
    # Allow a +/- 3 minute window, clamped to [0, 59].
    time_range_left_min = max([start_time_minute - 3, 0])
    time_range_right_min = min([start_time_minute + 3, 59])
    ocr_words = cleansing.clean(ocr_results)
    ocr_words = [w for w in ocr_words if len(w) > 2]
    if len(ocr_words) < 1:
        return -1
    overlap = 0
    story_id = -1
    for i, story in enumerate(stories):
        if time_overlap(time_range_left_min, time_range_right_min,
                        start_time_second, story):
            p = words_overlap_percentage(ocr_words, story)
            if p > overlap:
                overlap = p
                story_id = i
    if overlap >= 0.4:
        return story_id
    else:
        return -1
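# The two helpers used above, time_overlap and words_overlap_percentage, are
# defined elsewhere in the project. Below is a minimal sketch of plausible
# versions, assuming a story object with hypothetical .start_minute and
# .words fields; the real interface may differ.
def time_overlap_sketch(left_min, right_min, second, story):
    # True when the story starts within the clamped +/- 3 minute window.
    return left_min <= story.start_minute <= right_min  # hypothetical attribute


def words_overlap_percentage_sketch(ocr_words, story):
    # Fraction of OCR words that also occur in the story's word list.
    story_words = set(story.words)  # hypothetical attribute
    if not ocr_words:
        return 0.0
    return sum(1 for w in ocr_words if w in story_words) / float(len(ocr_words))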
def _read_caption_and_clean(captionfile):
    words = []
    with codecs.open(captionfile, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            # Keep only content lines: a leading timestamp digit or a
            # recognized three-character tag, with the text after the last '|'.
            if line[0] == '2' or line[0:3] in g_content_tags:
                content = line.split('|')[-1]
                words.extend(content.split())
    return cleansing.clean(words)
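# For illustration only, a hypothetical caption line in the pipe-delimited
# shape _read_caption_and_clean expects; the tag name and timestamp are
# invented, not values from the project's data:
#
#   CC1|20080819233000|this is one line of caption text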
def read_testing_file(filenameprefix):
    """Read the triplet files of the segments that correspond to the test file."""
    file_name = 'data/transformed_triplet_files/' + filenameprefix + '*.txt'
    files = glob.glob(file_name)
    files.sort()
    true_segment = []
    all_sentences = []
    line_count_total = 0
    for segments_file in files:
        # Skip the teaser and NULL segment files.
        if (segments_file.split('/')[-1].split('_')[-1]
                .split('|')[0].split(':')[-1] == 'Teaser'
                or segments_file.split('/')[-1]
                .split('_')[-1].split('.')[0] == 'NULL'):
            continue
        line_count = -1
        current_seg_sentences = []
        with open(segments_file, 'r') as f:
            for line in f:
                if line[0] != '<':
                    line_count += 1
                    line = line[:-2].lower()
                    triplets = line.split('|')
                    # Flag sentences whose subject NP starts with a pronoun.
                    np1 = triplets[0].split()
                    pronoun_flag = bool(np1) and _is_pronoun(np1[0])
                    np1 = cleansing.clean(triplets[0].split())
                    vp = cleansing.clean(triplets[1].split())
                    np2 = cleansing.clean(triplets[2].split())
                    current_seg_sentences.append(
                        Sentence(np1, vp, np2, pronoun_flag))
        # Only keep segments longer than 5 sentences.
        segment_length = len(current_seg_sentences)
        if segment_length > 5:
            seg = [sid + line_count_total for sid in range(segment_length)]
            true_segment.append(set(seg))
            all_sentences.extend(current_seg_sentences)
            line_count_total += segment_length
    return [all_sentences, true_segment]
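# _is_pronoun is defined elsewhere in this module. A minimal sketch against a
# hypothetical closed list of personal pronouns (the real list may differ):
_PRONOUNS_SKETCH = frozenset([
    'i', 'you', 'he', 'she', 'it', 'we', 'they',
    'me', 'him', 'her', 'us', 'them'])


def _is_pronoun_sketch(word):
    # The caller lower-cases the line first, so a plain membership test works.
    return word in _PRONOUNS_SKETCH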
def read_triplet_file(triplet_filename, use_ocr=False):
    ocr_file = None
    np1_words = []
    vp_words = []
    np2_words = []
    count = 0
    with open(triplet_filename, 'r') as f:
        for line in f:
            if line[0] != '<':
                count += 1
                line = line[:-2].lower()
                triplets = line.split('|')
                np1_words.extend(cleansing.clean(triplets[0].split()))
                # Some lines carry only an NP1; full triplets are NP1|VP|NP2.
                if len(triplets) == 3:
                    vp_words.extend(cleansing.clean(triplets[1].split()))
                    np2_words.extend(cleansing.clean(triplets[2].split()))
    ocr_words = []
    if use_ocr:
        # The OCR result file shares the triplet file's base name.
        name_tmp = triplet_filename[:-4].split('/')[-1]
        ocr_file = 'data/ocr_result_ori/' + name_tmp.lower()
        if os.path.exists(ocr_file):
            with open(ocr_file, 'r') as f:
                for line in f:
                    if line[0] != '<':
                        line = line[:-2].lower()
                        ocr_words.extend(cleansing.clean(line.split()))
    # The leading '%Y%m%d%H%M%S' prefix of the file name encodes the timestamp.
    timestamp = datetime.datetime.strptime(
        (triplet_filename.split('/')[-1]).split('_')[0].split('.')[0],
        '%Y%m%d%H%M%S')
    filename = triplet_filename.split('/')[-1][:-4]
    return OrignalDocument(filename, timestamp, np1_words, vp_words,
                           np2_words, ocr_words)
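# OrignalDocument (name as spelled in this codebase) is defined elsewhere.
# A minimal sketch as a namedtuple carrying the six fields populated above;
# the real class may hold more state:
import collections

OrignalDocumentSketch = collections.namedtuple(
    'OrignalDocumentSketch',
    ['filename', 'timestamp', 'np1_words', 'vp_words', 'np2_words',
     'ocr_words'])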
def filter_story(filename):
    filtered_words = []
    # Read in the story; content lines look like TAG|timestamp|content.
    with codecs.open(filename, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.lower()
            # Only split and filter content lines.
            parts = line.split('|')
            if parts[0] in CONTENT_LINE_TAG:
                word_list = parts[-1].split(' ')
                word_list = cleansing.clean(word_list)
                filtered_words.extend(word_list)
    # Write the cleaned story to disk.
    filtered_story = ' '.join(filtered_words)
    parts = filename.split('/')
    newfilename = '/dataset/08cleaned/' + parts[-1]
    with codecs.open(newfilename + '.txt', 'w', encoding='ISO-8859-1') as f:
        f.write(filtered_story)
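# A hedged usage sketch for filter_story: clean every story under a
# hypothetical raw-data directory. The input glob below is an assumption,
# not a path the project defines.
if __name__ == '__main__':
    for story_path in glob.glob('/dataset/08raw/*'):  # hypothetical input dir
        filter_story(story_path)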
def add_line(self, line):
    words = cleansing.clean(line.split())
    self.line_list.extend(words)
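# add_line is a method of a container class not shown in this excerpt. A
# minimal hypothetical host exposing the line_list attribute it relies on:
class WordList(object):  # hypothetical class name
    def __init__(self):
        self.line_list = []
    # add_line (above) would be defined on this class.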