def _record_images(cur_image, caption, context, extracted_image_data):
    """
    Append one (image, caption, context) tuple for every image in cur_image.

    @param: cur_image (string || list): a single image name, or a
        [main_image, [sub_image, ...]] pair
    @param: caption (string): the caption to attach to every image
    @param: context: value stored as the third element of each tuple
    @param: extracted_image_data (list): the list that is appended to

    @return: None; extracted_image_data is modified in place
    """
    if isinstance(cur_image, list):
        # the main image is recorded even when it is the empty string
        extracted_image_data.append(
            (cur_image[MAIN_CAPTION_OR_IMAGE], caption, context))
        for sub_img in cur_image[SUB_CAPTION_OR_IMAGE]:
            extracted_image_data.append((sub_img, caption, context))
    else:
        extracted_image_data.append((cur_image, caption, context))


def _find_nearby_caption(candidate_indexes, lines):
    """
    Look for the first candidate line holding a '{' that is not directly
    preceded by a word character, and return the text of that brace group.

    @param: candidate_indexes (iterable of int): line indexes to try, in order
    @param: lines ([string, string, ...]): the lines in the TeX

    @return: (string || None): the assembled caption, or None when none of
        the candidate lines contains such a brace
    """
    curly_no_tag_preceding = '(?<!\\w){'

    for candidate in candidate_indexes:
        m = re.search(curly_no_tag_preceding, lines[candidate])
        if m is None:
            continue
        open_curly, open_curly_line, close_curly, close_curly_line = \
            find_open_and_close_braces(candidate, m.start(), '{', lines)
        cap_begin = open_curly + 1
        return assemble_caption(open_curly_line, cap_begin,
                                close_curly_line, close_curly, lines)
    return None


def put_it_together(cur_image, caption, context, extracted_image_data,
                    line_index, lines):
    """
    Takes the current image(s) and caption(s) and assembles them into
    something useful in the extracted_image_data list.

    @param: cur_image (string || list): the image currently being dealt with,
        or a [main_image, [sub_image, ...]] pair in the case of subimages
    @param: caption (string || list): the caption currently in scope, or a
        [main_caption, [sub_caption, ...]] pair
    @param: context: value stored as the third element of every tuple that
        is appended
    @param: extracted_image_data ([(string, string, context), ...]): a list
        of tuples of images matched to captions from this document; it is
        appended to in place
    @param: line_index (int): the index where we are in the lines (for
        searchback and searchforward purposes)
    @param: lines ([string, string, ...]): the lines in the TeX

    @return: (cur_image, caption, extracted_image_data): cur_image and
        caption are always returned as '' (the figure is finished), together
        with the updated extracted_image_data
    """
    if isinstance(cur_image, list):
        if cur_image[MAIN_CAPTION_OR_IMAGE] == 'ERROR':
            cur_image[MAIN_CAPTION_OR_IMAGE] = ''
        sub_images = cur_image[SUB_CAPTION_OR_IMAGE]
        if isinstance(sub_images, list):
            # rebuild in place: removing entries while iterating over the
            # same list skips the element following each removal
            sub_images[:] = [img for img in sub_images if img != 'ERROR']

    if cur_image != '' and caption != '':

        if isinstance(cur_image, list) and isinstance(caption, list):
            main_image = cur_image[MAIN_CAPTION_OR_IMAGE]
            main_caption = caption[MAIN_CAPTION_OR_IMAGE]
            sub_images = cur_image[SUB_CAPTION_OR_IMAGE]
            sub_captions = caption[SUB_CAPTION_OR_IMAGE]

            if main_image != '' and main_caption != '':
                extracted_image_data.append(
                    (main_image, main_caption, context))
            if isinstance(main_image, list):
                # it's a good idea to attach the main caption to other
                # things, but the main image can only be used once
                cur_image[MAIN_CAPTION_OR_IMAGE] = ''

            if isinstance(sub_images, list):
                if isinstance(sub_captions, list):
                    # pair sub-images with sub-captions by position
                    for index, sub_image in enumerate(sub_images):
                        if index < len(sub_captions):
                            sub_caption = sub_captions[index]
                        else:
                            sub_caption = 'Caption not extracted'
                        extracted_image_data.append(
                            (sub_image,
                             main_caption + ' : ' + sub_caption,
                             context))
                else:
                    long_caption = main_caption + ' : ' + sub_captions
                    for sub_image in sub_images:
                        extracted_image_data.append(
                            (sub_image, long_caption, context))
            else:
                if isinstance(sub_captions, list):
                    # one image, several captions: chain them all
                    long_caption = ' : '.join([main_caption] + sub_captions)
                    extracted_image_data.append(
                        (sub_images, long_caption, context))
                else:
                    extracted_image_data.append(
                        (sub_images, sub_captions, context))

        elif isinstance(cur_image, list):
            if cur_image[MAIN_CAPTION_OR_IMAGE] != '':
                extracted_image_data.append(
                    (cur_image[MAIN_CAPTION_OR_IMAGE], caption, context))
            if isinstance(cur_image[SUB_CAPTION_OR_IMAGE], list):
                for image in cur_image[SUB_CAPTION_OR_IMAGE]:
                    extracted_image_data.append((image, caption, context))
            else:
                extracted_image_data.append(
                    (cur_image[SUB_CAPTION_OR_IMAGE], caption, context))

        elif isinstance(caption, list):
            if caption[MAIN_CAPTION_OR_IMAGE] != '':
                extracted_image_data.append(
                    (cur_image, caption[MAIN_CAPTION_OR_IMAGE], context))
            if isinstance(caption[SUB_CAPTION_OR_IMAGE], list):
                # multiple caps for one image:
                long_caption = caption[MAIN_CAPTION_OR_IMAGE]
                for subcap in caption[SUB_CAPTION_OR_IMAGE]:
                    if long_caption != '':
                        long_caption += ' : '
                    long_caption += subcap
                extracted_image_data.append(
                    (cur_image, long_caption, context))
            else:
                # this used to read "caption[...]. context" (attribute
                # access), which raised instead of storing a 3-tuple
                extracted_image_data.append(
                    (cur_image, caption[SUB_CAPTION_OR_IMAGE], context))

        else:
            extracted_image_data.append((cur_image, caption, context))

    elif cur_image != '' and caption == '':
        # we may have missed the caption somewhere.
        REASONABLE_SEARCHBACK = 25
        REASONABLE_SEARCHFORWARD = 5

        # walk backwards from the current line, never past the first line
        first_excluded = max(line_index - REASONABLE_SEARCHBACK, -1)
        found = _find_nearby_caption(
            range(line_index, first_excluded, -1), lines)
        if found is not None:
            # stop at the first match for single images and image lists
            # alike, so the figure is recorded once
            caption = found
            _record_images(cur_image, caption, context, extracted_image_data)

        if caption == '':
            # walk forwards, never past the last line
            end = min(line_index + REASONABLE_SEARCHFORWARD, len(lines))
            found = _find_nearby_caption(range(line_index, end), lines)
            if found is not None:
                caption = found
                _record_images(cur_image, caption, context,
                               extracted_image_data)

        if caption == '':
            if isinstance(cur_image, list):
                extracted_image_data.append(
                    (cur_image[MAIN_CAPTION_OR_IMAGE], 'No caption found',
                     context))
                for sub_img in cur_image[SUB_CAPTION_OR_IMAGE]:
                    extracted_image_data.append(
                        (sub_img, 'No caption', context))
            else:
                extracted_image_data.append(
                    (cur_image, 'No caption found', context))

    elif caption != '' and cur_image == '':
        if isinstance(caption, list):
            long_caption = caption[MAIN_CAPTION_OR_IMAGE]
            for subcap in caption[SUB_CAPTION_OR_IMAGE]:
                long_caption = long_caption + ': ' + subcap
        else:
            long_caption = caption
        extracted_image_data.append(('', 'noimg' + long_caption, context))

    # if we're leaving the figure, no sense keeping the data
    cur_image = ''
    caption = ''

    return (cur_image, caption, extracted_image_data)
def _add_image_name(cur_image, filename):
    """
    Fold a newly found image name into the current image state.

    @param: cur_image (string || list): '' when no image is pending, a single
        image name, or a [main_image, [sub_image, ...]] pair
    @param: filename (string): the image name to add

    @return: (string || list): the updated image state
    """
    if cur_image == '':
        return filename
    if isinstance(cur_image, list):
        if isinstance(cur_image[SUB_CAPTION_OR_IMAGE], list):
            cur_image[SUB_CAPTION_OR_IMAGE].append(filename)
        else:
            cur_image[SUB_CAPTION_OR_IMAGE] = [filename]
        return cur_image
    # a second image turns the single name into a list of sub-images
    return ['', [cur_image, filename]]


def extract_captions(tex_file, sdir, image_list, primary=True):
    """
    Take the TeX file and the list of images in the tarball (which all,
    presumably, are used in the TeX file) and figure out which captions
    in the text are associated with which images

    @param: tex_file (string): the name of the TeX file which mentions
        the images
    @param: sdir (string): path to current sub-directory
    @param: image_list (list): list of images in tarball
    @param: primary (bool): is this the primary call to extract_caption?
        Only the primary call drops everything before the document head
        and follows input tags into other TeX files

    @return: extracted_image_data (list): the tuples collected by
        put_it_together, each built from an image name, a caption and the
        figure label that was active at the time; [] when tex_file is a
        directory or does not exist
    """
    if os.path.isdir(tex_file) or not os.path.exists(tex_file):
        return []

    with open(tex_file) as fd:
        lines = fd.readlines()

    # possible figure lead-ins
    figure_head = '\\begin{figure'  # also matches figure*
    figure_tail = '\\end{figure'  # also matches figure*
    subfloat_head = '\\subfloat'
    subfig_head = '\\subfigure'
    includegraphics_head = '\\includegraphics'
    epsfig_head = '\\epsfig'
    input_head = '\\input'
    # possible caption lead-ins
    caption_head = '\\caption'
    figcaption_head = '\\figcaption'
    label_head = '\\label'
    rotate = 'rotate='
    angle = 'angle='
    eps_tail = '.eps'
    ps_tail = '.ps'

    doc_head = '\\begin{document}'
    doc_tail = '\\end{document}'

    extracted_image_data = []
    cur_image = ''
    caption = ''
    labels = []
    active_label = ""

    # blank out everything before the doc head
    if primary:
        for line_index, line in enumerate(lines):
            if doc_head in line:
                break
            lines[line_index] = ''

    # are we using commas in filenames here?
    tarball_root = os.path.split(os.path.split(tex_file)[0])[0]
    commas_okay = any(',' in filename
                      for dummy1, dummy2, filenames in os.walk(tarball_root)
                      for filename in filenames)

    # a comment is a % not preceded by a \
    comment = re.compile("(?<!\\\\)%")

    # get rid of pesky comments by splitting where the comment is
    # and keeping only the part before the %
    lines = [comment.split(raw_line)[0].strip() for raw_line in lines]

    in_figure_tag = False

    for line_index in range(len(lines)):
        line = lines[line_index]

        if line == '':
            continue
        if line.find(doc_tail) > -1:
            return extracted_image_data

        # FIGURE -
        # structure of a figure:
        # \begin{figure}
        # \formatting...
        # \includegraphics[someoptions]{FILENAME}
        # \caption{CAPTION}  %caption and includegraphics may be switched!
        # \end{figure}
        if figure_head in line:
            in_figure_tag = True
            # some people don't put things in the figure tag.  so we
            # just want to see if there is anything that is sitting outside
            # of it when we find it
            cur_image, caption, extracted_image_data = \
                put_it_together(cur_image, caption, active_label,
                                extracted_image_data, line_index, lines)

        # inclusion by file extension or by \epsfig
        has_ext = eps_tail in line or ps_tail in line
        if has_ext or epsfig_head in line:
            filenames = intelligently_find_filenames(
                line, ext=has_ext, commas_okay=commas_okay)

            # try to look ahead!  sometimes there are better matches after
            if line_index < len(lines) - 1:
                filenames.extend(intelligently_find_filenames(
                    lines[line_index + 1], commas_okay=commas_okay))
            if line_index < len(lines) - 2:
                filenames.extend(intelligently_find_filenames(
                    lines[line_index + 2], commas_okay=commas_okay))

            for filename in filenames:
                cur_image = _add_image_name(cur_image, str(filename))

        # Rotate and angle
        if rotate in line or angle in line:
            # which is the image associated to it?
            filenames = intelligently_find_filenames(
                line, commas_okay=commas_okay)
            # try the line after and the line before
            if line_index + 1 < len(lines):
                filenames.extend(intelligently_find_filenames(
                    lines[line_index + 1], commas_okay=commas_okay))
            if line_index > 0:
                # "> 1" here used to skip the previous line of line 1
                filenames.extend(intelligently_find_filenames(
                    lines[line_index - 1], commas_okay=commas_okay))

            already_tried = []
            for filename in filenames:
                if filename != 'ERROR' and filename not in already_tried:
                    if rotate_image(filename, line, sdir, image_list):
                        break
                    already_tried.append(filename)

        # INCLUDEGRAPHICS -
        # structure of includegraphics:
        # \includegraphics[someoptions]{FILENAME}
        index = line.find(includegraphics_head)
        if index > -1:
            open_curly, open_curly_line, close_curly, dummy = \
                find_open_and_close_braces(line_index, index, '{', lines)
            filename = lines[open_curly_line][open_curly + 1:close_curly]
            cur_image = _add_image_name(cur_image, filename)

        # {\input{FILENAME}}
        # \caption{CAPTION}
        #
        # This input is ambiguous, since input is also used for things like
        # inclusion of data from other LaTeX files directly.
        if input_head in line:
            new_tex_names = intelligently_find_filenames(
                line, TeX=True, commas_okay=commas_okay)

            for new_tex_name in new_tex_names:
                if new_tex_name != 'ERROR':
                    new_tex_file = get_tex_location(new_tex_name, tex_file)
                    if new_tex_file is not None and primary:  # to kill recursion
                        extracted_image_data.extend(extract_captions(
                            new_tex_file, sdir, image_list, primary=False))

        # \begin{picture} and \begin{displaymath} are not handled.

        # CAPTIONS -
        # structure of a caption:
        # \caption[someoptions]{CAPTION}
        # or
        # \caption{CAPTION}
        # or
        # \caption{{options}{CAPTION}}
        index = max([line.find(caption_head), line.find(figcaption_head)])
        if index > -1:
            open_curly, open_curly_line, close_curly, close_curly_line = \
                find_open_and_close_braces(line_index, index, '{', lines)

            cap_begin = open_curly + 1

            cur_caption = assemble_caption(open_curly_line, cap_begin,
                                           close_curly_line, close_curly,
                                           lines)

            if caption == '':
                caption = cur_caption
            elif isinstance(caption, list):
                if isinstance(caption[SUB_CAPTION_OR_IMAGE], list):
                    caption[SUB_CAPTION_OR_IMAGE].append(cur_caption)
                else:
                    caption[SUB_CAPTION_OR_IMAGE] = [cur_caption]
            elif caption != cur_caption:
                caption = ['', [caption, cur_caption]]

        # SUBFLOATS -
        # structure of a subfloat (inside of a figure tag):
        # \subfloat[CAPTION]{options{FILENAME}}
        #
        # also associated with the overall caption of the enclosing figure
        index = line.find(subfloat_head)
        if index > -1:
            # if we are dealing with subfloats, we need a different
            # sort of structure to keep track of captions and subcaptions
            if not isinstance(cur_image, list):
                cur_image = [cur_image, []]
            if not isinstance(caption, list):
                caption = [caption, []]

            open_square, open_square_line, close_square, close_square_line = \
                find_open_and_close_braces(line_index, index, '[', lines)
            cap_begin = open_square + 1

            sub_caption = assemble_caption(open_square_line, cap_begin,
                                           close_square_line, close_square,
                                           lines)
            caption[SUB_CAPTION_OR_IMAGE].append(sub_caption)

            open_curly, open_curly_line, close_curly, dummy = \
                find_open_and_close_braces(close_square_line,
                                           close_square, '{', lines)
            sub_image = lines[open_curly_line][open_curly + 1:close_curly]

            cur_image[SUB_CAPTION_OR_IMAGE].append(sub_image)

        # SUBFIGURES -
        # structure of a subfigure (inside a figure tag):
        # \subfigure[CAPTION]{
        # \includegraphics[options]{FILENAME}}
        #
        # also associated with the overall caption of the enclosing figure
        index = line.find(subfig_head)
        if index > -1:
            # like with subfloats, we need a different structure for keeping
            # track of this stuff
            if not isinstance(cur_image, list):
                cur_image = [cur_image, []]
            if not isinstance(caption, list):
                caption = [caption, []]

            open_square, open_square_line, close_square, close_square_line = \
                find_open_and_close_braces(line_index, index, '[', lines)
            cap_begin = open_square + 1

            sub_caption = assemble_caption(open_square_line, cap_begin,
                                           close_square_line, close_square,
                                           lines)
            caption[SUB_CAPTION_OR_IMAGE].append(sub_caption)

            # find the graphics tag to get the filename.
            # NOTE: rebinding line_index does not make the for loop skip
            # lines; it only means the label and end-of-figure checks
            # further down in this iteration look at the line that holds
            # the graphics tag.
            subfig_line_index = line_index
            subfig_line = line
            index = line.find(includegraphics_head)
            while index == -1 and (line_index + 1) < len(lines):
                line_index = line_index + 1
                line = lines[line_index]
                index = line.find(includegraphics_head)

            if index == -1:
                # didn't find the image name: go back to the subfigure line
                # and record no image for this sub-caption
                line_index = subfig_line_index
                line = subfig_line
            else:
                open_curly, open_curly_line, close_curly, dummy = \
                    find_open_and_close_braces(line_index, index, '{', lines)
                sub_image = lines[open_curly_line][open_curly + 1:close_curly]

                cur_image[SUB_CAPTION_OR_IMAGE].append(sub_image)

        # LABELS -
        # structure of a label:
        # \label{somelabelnamewhichprobablyincludesacolon}
        #
        # Labels are used to tag images and will later be used in ref tags
        # to reference them.  This is interesting because in effect the refs
        # to a plot are additional caption for it.
        #
        # Notes: labels can be used for many more things than just plots.
        # We'll have to experiment with how to best associate a label with an
        # image.. if it's in the caption, it's easy.  If it's in a figure,
        # it's still okay... but the images that aren't in figure tags are
        # numerous.
        index = line.find(label_head)
        if index > -1 and in_figure_tag:
            open_curly, open_curly_line, close_curly, dummy = \
                find_open_and_close_braces(line_index, index, '{', lines)
            label = lines[open_curly_line][open_curly + 1:close_curly]
            if label not in labels:
                active_label = label
                labels.append(label)

        # FIGURE
        #
        # important: we put the check for the end of the figure at the end
        # of the loop in case some pathological person puts everything in
        # one line
        index = max([line.find(figure_tail), line.find(doc_tail)])
        if index > -1:
            in_figure_tag = False
            cur_image, caption, extracted_image_data = \
                put_it_together(cur_image, caption, active_label,
                                extracted_image_data, line_index, lines)

        # END DOCUMENT
        #
        # we shouldn't look at anything after the end document tag is found
        index = line.find(doc_tail)
        if index > -1:
            break

    return extracted_image_data