def get_related_figures(identifiers_dir, text_data_dir, embeddings_dir,
                        test_identifiers_dir, output_dir):
    """Get semantically related figures for a set of test figures.

    Args:
      identifiers_dir: (string) identifiers of all figures in the collection.
      text_data_dir: (string) the file with the text for each figure (for
        keyword retrieval purposes).
      embeddings_dir: (string) the embedding vectors for all figures in the
        collection.
      test_identifiers_dir: (string) the figures for which we want to find
        related figures (a subset of the full collection).
      output_dir: (string) directory for the output data.

    Returns:
      None. Outputs the related figures to a file.
    """
    test_identifiers = utils.read_lines_from_file(test_identifiers_dir)
    all_identifiers = utils.read_lines_from_file(identifiers_dir)
    # First stage: keyword (tf-idf) retrieval of candidate figures.
    tf_idf_matrix = KnnSearcher.get_tf_idf_embeddings(text_data_dir)
    searcher = KnnSearcher(tf_idf_matrix, all_identifiers, 100)
    initial_result_list = searcher.perform_keyword_retrieval(test_identifiers)
    # Second stage: re-rank the candidates with the embedding vectors.
    embedding_matrix = utils.load_embeddings(embeddings_dir)
    final_result_list = re_rank_with_embeddings(initial_result_list,
                                                embedding_matrix,
                                                all_identifiers)
    with open(output_dir, 'w+') as output_file:
        for figure_id in final_result_list:
            line = figure_id
            for other_figure in final_result_list[figure_id]:
                if other_figure[0] != figure_id:
                    line += ',' + other_figure[0]
            output_file.write(line + '\n')
def get_tf_idf_vectors(data_dir, ids_dir, min_df=5):
    ids = utils.read_lines_from_file(ids_dir)
    count_vector = CountVectorizer(min_df=min_df)
    tf_idf_transformer = TfidfTransformer()
    data_lines = utils.read_lines_from_file(data_dir)
    tf_vectors = count_vector.fit_transform(data_lines)
    tf_idf_vectors = tf_idf_transformer.fit_transform(tf_vectors)
    return tf_idf_vectors, ids
def parse(self):
    grub_lines = utils.read_lines_from_file(self.menu)
    block_num = 0
    last_title = ""
    block_value = []
    for line in grub_lines:
        if line[:5] == "title":
            block_num += 1
            block_value = []
            last_title = str(block_num)
            block_value.append((line[:5], line[6:]))
            self.title_blocks[str(block_num)] = block_value
            continue
        if line[:6] == "kernel":
            self.title_blocks[last_title].append((line[:6], line[7:]))
            continue
        if line[:6] == "initrd":
            self.title_blocks[last_title].append((line[:6], line[7:]))
            continue
        if line[:6] == "append":
            self.title_blocks[last_title].append((line[:6], line[7:]))
            continue
def __change_config(self, identifier, old_value, new_value):
    """Change the configuration file based on the passed values."""
    line_number = self.__get_line_number(identifier)
    lines = utils.read_lines_from_file(self.menu)
    if line_number == -1:
        line_number = 0
        lines.insert(0, "\n")
    line = lines[line_number]
    if old_value is None:
        lines[line_number] = new_value
    elif old_value == "vga=":
        place = line.find(old_value)
        end = line.find(" ", place)
        if place != -1:
            if end != -1:
                line = line[:place] + new_value + line[end:]
            else:
                line = line[:place] + new_value + "\n"
        else:
            line = line[:-1] + " " + new_value + "\n"
        lines[line_number] = line
    else:
        if old_value[0] != "#" and line.find(" " + old_value) != -1:
            old_value = " " + old_value
        if old_value[0] == "#" and line.find("# " + old_value[1:]) != -1:
            old_value = "# " + old_value[1:]
        line = line.replace(old_value, new_value)
        lines[line_number] = line
    utils.write_lines_to_file(self.menu, lines)
def get_figure_mentions_by_words(grobid_article_dir, figure_num, window_size,
                                 captions_dict, mentions_dict):
    """Get the text of figure mentions in the article using word windows.

    Args:
      grobid_article_dir: (string) the directory of the processed Grobid file
        of the article.
      figure_num: (string) the figure number.
      window_size: (int) the number of words that should surround a figure
        mention.
      captions_dict: (dictionary) the captions of the figures.
      mentions_dict: (dictionary) the locations in the texts where the
        mentions are.

    Returns:
      (string, string). The caption text and the quoted mentions text.
    """
    mentions_text = '\"...'
    caption_text = ''
    article_lines = utils.read_lines_from_file(grobid_article_dir)
    doc_txt = ''
    for line in article_lines:
        line = line.rstrip('\n')
        doc_txt += line + ' '
    doc_txt = doc_txt.rstrip(' ')
    words = doc_txt.split(' ')
    if str(figure_num) in captions_dict.keys():
        # Keep the shortest candidate caption for the figure.
        for caption_id in captions_dict[str(figure_num)]:
            if len(article_lines[caption_id]) < len(caption_text) or len(
                    caption_text) == 0:
                caption_text = article_lines[caption_id] + ' '
        caption_text = caption_text.rstrip(' ')
        caption_text = utils.process_caption(caption_text, figure_num)
    if str(figure_num) in mentions_dict.keys():
        all_summary_ids = []
        for word_id in mentions_dict[str(figure_num)]:
            summary_ids = []
            for i in range(window_size, 0, -1):
                summary_ids += [word_id - i]
            summary_ids += [word_id]
            for i in range(1, window_size + 1):
                summary_ids += [word_id + i]
            all_summary_ids += [summary_ids]
        all_summary_ids = merge_texts(all_summary_ids)
        for summary_ids in all_summary_ids:
            for i in summary_ids:
                if i < 0 or i >= len(words):
                    continue
                mentions_text += ' ' + words[i]
            mentions_text += ' ... '
    mentions_text = mentions_text.rstrip(' ') + '\"'
    return caption_text, mentions_text
def __has_separate_boot_partition(self):
    """Try to find out if there is a separate boot partition.

    Return True if it is likely, i.e. if /etc/fstab has a line
    containing '/boot'.
    """
    for line in utils.read_lines_from_file(self.fstab):
        if line.find("/boot") != -1:
            return True
    return False
def read_lines_from_cached_file_legacy(template_file):
    lines = CACHED_FILES.get(template_file)
    if lines is None:
        lines = read_lines_from_file(template_file)
        lines = re.sub(LEGACY_PLACEHOLDERS_UNESCAPED_REPLACEMENT_PATTERN,
                       r'\1{{{\2}}}', lines)
        lines = re.sub(LEGACY_PLACEHOLDERS_REPLACEMENT_PATTERN,
                       r'\1{{\2}}', lines)
        CACHED_FILES[template_file] = lines
    return lines
def extract_all_figures(all_figures_dir, count_threshold=5):
    """Generate textual representations for all figures in the collection.

    Args:
      all_figures_dir: (string) the folder with the textual fields for the
        figures (the files used to build an index).
      count_threshold: (int) the minimum number of tokens required to keep a
        figure.

    Returns:
      (list, list, list). Figure tokens, figure identifiers, figure image
      files.
    """
    text_data, figure_identifiers, image_file_names = [], [], []
    for i, article_file_name in enumerate(os.listdir(all_figures_dir)):
        figure_flag = False
        caption, mention, figure_num, image_file_name = '', '', '', ''
        for line in utils.read_lines_from_file(all_figures_dir + '/' +
                                               article_file_name):
            if '<figure>' in line:
                # A new figure starts; flush the previous one if it had enough
                # tokens.
                if figure_flag:
                    tokens = text_to_tokens(caption + ' ' + mention)
                    if len(tokens) > count_threshold:
                        text_data.append(tokens)
                        figure_identifiers.append(
                            article_file_name.split('.')[0] + '_' + figure_num)
                        image_file_names.append(image_file_name)
                figure_flag = False
                caption, mention, figure_num, image_file_name = '', '', '', ''
            # Note: rstrip/lstrip remove character sets, not literal tags;
            # this works here because the field values share no characters
            # with the tags.
            if '<figure>' in line:
                figure_num = line.rstrip('\n').rstrip('</figure>').lstrip(
                    '<figure>')
                if len(figure_num) == 2:
                    figure_num = figure_num.replace('.', '')
                figure_flag = True
            if '<caption>' in line and figure_flag:
                caption = line.rstrip('\n').rstrip('</caption>').lstrip(
                    '<caption>')
            if '<lines3>' in line and figure_flag:
                mention = line.rstrip('\n').rstrip('</lines3>').lstrip(
                    '<lines3>')
            if '<file>' in line and figure_flag:
                image_file_name = line.rstrip('\n').rstrip('</file>').lstrip(
                    '<file>')
        if figure_flag:
            tokens = text_to_tokens(caption + ' ' + mention)
            if len(tokens) > count_threshold:
                text_data.append(tokens)
                figure_identifiers.append(
                    article_file_name.split('.')[0] + '_' + figure_num)
                image_file_names.append(image_file_name)
    return text_data, figure_identifiers, image_file_names
def parse_qrel_file(qrel_dir):
    qrel_dict = {}
    lines = utils.read_lines_from_file(qrel_dir, delimiter=' ')
    for line in lines:
        label = line[3]
        if label == '1':
            qid = line[0]
            doc = line[2]
            qrel_dict[qid] = qrel_dict.get(qid, []) + [doc]
    return qrel_dict
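# Usage note (an assumption, not from the original source): parse_qrel_file
# expects standard TREC qrel lines of the form "<query_id> 0 <doc_id> <label>",
# already split on spaces by utils.read_lines_from_file(..., delimiter=' '),
# so line[0] is the query id, line[2] the document id and line[3] the label.
# For a file containing
#   q1 0 doc_a 1
#   q1 0 doc_b 0
# it would return {'q1': ['doc_a']}.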
def __fix_menu_lst_bug(self):
    """Take care of an annoying thing in menu.lst."""
    grub_lines = utils.read_lines_from_file(self.menu)
    password_lines = []
    for line in grub_lines:
        # Note: str.strip() returns a new string, so this call has no effect.
        line.strip()
        if line[0] == "#":
            line = line[1:].strip()
        if line[:8] == "password":
            password_lines.append(line)
    if len(password_lines) == 2:
        self.__change_config(password_lines[0], None,
                             "## " + password_lines[0] + "\n")
def __get_line_number(self, identifier):
    """Return the number of the line that begins with the string identifier."""
    lines = utils.read_lines_from_file(self.menu)
    length = len(identifier)
    tracker = 0
    for line in lines:
        if line.strip()[:1] == "#":
            line = line.strip()[1:]
        line = line.strip()[:length].strip()
        if line == identifier:
            return tracker
        tracker += 1
    return -1
def get_tf_idf_embeddings(data_dir):
    """Get the tf-idf matrix of the figures in the collection.

    Args:
      data_dir: (string) a file with the text data of the figures.

    Returns:
      tf-idf matrix.
    """
    data_lines = utils.read_lines_from_file(data_dir)
    count_vector = CountVectorizer()
    tf_vectors = count_vector.fit_transform(data_lines)
    tf_idf_transformer = TfidfTransformer()
    tf_idf_vectors = tf_idf_transformer.fit_transform(tf_vectors)
    return tf_idf_vectors
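# A minimal, self-contained sketch of the same tf-idf step using scikit-learn
# directly (not part of the original sources); it mirrors what
# get_tf_idf_embeddings does once the figure texts have been read from disk.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

example_figure_texts = [
    'bar chart of accuracy per model',
    'scatter plot of training loss versus epochs',
]
example_counts = CountVectorizer().fit_transform(example_figure_texts)
example_tf_idf = TfidfTransformer().fit_transform(example_counts)
print(example_tf_idf.shape)  # (2, vocabulary size), a scipy sparse matrix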
def get_captions(grobid_article_dir):
    """Get the captions of all figures from the Grobid processed files.

    Args:
      grobid_article_dir: (string) the directory of the processed Grobid file
        of the article.

    Returns:
      (dictionary). Mapping from figure identifiers to the line ids of their
      captions.
    """
    figure_caption_dict = {}
    article_lines = utils.read_lines_from_file(grobid_article_dir)
    fig_captions_flag = False
    for line_id, line in enumerate(article_lines):
        if '<figcaptions>' in line:
            fig_captions_flag = True
            continue
        if '<title>' in line:
            break
        if not fig_captions_flag:
            continue
        words = utils.process_line(line)
        for i, word in enumerate(words):
            figure_flag = False
            for fig_format in figure_formats:
                if fig_format in word.lower():
                    figure_flag = True
            if not figure_flag:
                continue
            figure_num = utils.extract_number(words[i])
            try:
                int(figure_num)
            except (TypeError, ValueError):
                if len(words) > i + 1:
                    figure_num = utils.extract_number(words[i + 1])
                else:
                    continue
            if figure_num not in figure_caption_dict.keys():
                figure_caption_dict[figure_num] = []
            if line_id not in figure_caption_dict[figure_num]:
                figure_caption_dict[figure_num] += [line_id]
    return figure_caption_dict
def get_article_field(grobid_article_dir, field_name):
    """Get a specific article field of the figure's article.

    Args:
      grobid_article_dir: (string) the directory of the processed Grobid file
        of the article.
      field_name: (string) the field name, such as 'introduction'.

    Returns:
      (string). The content of the textual field.
    """
    output = ''
    predicate = False
    lines = utils.read_lines_from_file(grobid_article_dir)
    for line in lines:
        if predicate:
            output = line.rstrip('\n')
            predicate = False
        if '<' + field_name + '>' in line:
            predicate = True
    return output
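# Usage note (inferred from the code above and from process_grobid below, not
# stated in the original sources): the processed article file stores each
# field as a marker line such as '<introduction>' followed by the field's
# content on the next line, so get_article_field(path, 'introduction')
# returns the line that immediately follows the '<introduction>' marker.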
def get_figure_ids_list(grobid_article_dir):
    """Get the list of all figure identifiers in a single article.

    Args:
      grobid_article_dir: (string) the directory of the processed Grobid file
        of the article.

    Returns:
      (list). The list of figure identifiers (numbers stored as strings).
    """
    figure_list = []
    doc_txt = ''
    lines = utils.read_lines_from_file(grobid_article_dir)
    for line in lines:
        line = line.rstrip('\n')
        doc_txt += line + ' '
    doc_txt = doc_txt.rstrip(' ')
    words = doc_txt.split(' ')
    for word_id, word in enumerate(words):
        word = utils.process_line(word)
        if len(word) == 0:
            continue
        word = word[0]
        if word.split('.')[0].lower() in figure_formats:
            if len(word.split('.')) > 1:
                word_length = len(word.split('.'))
                figure_num = utils.extract_number('.'.join(
                    word.split('.')[1:word_length]))
            elif len(words) > word_id + 1:
                figure_num = utils.extract_number(words[word_id + 1])
            else:
                continue
            if figure_num not in figure_list and figure_num != '':
                figure_list += [figure_num]
    return figure_list
def __check_themes(self):
    """Check for installed themes, send info to class variables."""
    self.grub_images = []
    if self.__get_line_number("splash") != -1:
        grub_lines = utils.read_lines_from_file(self.menu)
        location = self.__get_line_number("splash")
        splash_line = grub_lines[location].strip()
        correct_path = splash_line.find(self.groot + self.mounted_splashdir)
        if correct_path != -1:
            location = splash_line.rfind('/')
            file_name = splash_line[location + 1:].replace('\\', '')
            suffix = file_name.find(".xpm.gz")
            if suffix != -1:
                name = file_name[:suffix]
                for dirfile in os.listdir(self.mounted_splashdir):
                    if file_name == dirfile:
                        self.default_image = name
    grub_files = os.listdir(self.mounted_splashdir)
    for grub_file in grub_files:
        end = grub_file[grub_file[:-3].rfind("."):]
        if end == ".xpm.gz":
            self.grub_images.append(grub_file[:-7])
def md2html(document, plugins, metadata_handlers, options):
    input_location = document['input_file']
    output_location = document['output_file']
    title = document['title']
    template_file = document['template']
    link_css = document['link_css']
    include_css = document['include_css']
    force = document['force']
    verbose = document['verbose']
    report = document['report']

    output_file = Path(output_location)
    input_file = Path(input_location)

    if not force and output_file.exists():
        output_file_mtime = os.path.getmtime(output_file)
        input_file_mtime = os.path.getmtime(input_file)
        if output_file_mtime > input_file_mtime:
            if verbose:
                print(f'The output file is up-to-date. Skipping: {output_location}')
            return

    current_time = datetime.today()
    substitutions = {
        'title': title,
        'exec_name': EXEC_NAME,
        'exec_version': EXEC_VERSION,
        'generation_date': current_time.strftime('%Y-%m-%d'),
        'generation_time': current_time.strftime('%H:%M:%S')
    }
    styles = []
    if link_css:
        styles.extend([f'<link rel="stylesheet" type="text/css" href="{item}">'
                       for item in link_css])
    if include_css:
        styles.extend(['<style>\n' + read_lines_from_file(item) + '\n</style>'
                       for item in include_css])
    substitutions['styles'] = '\n'.join(styles) if styles else ''

    md_lines = read_lines_from_file(input_file)
    for plugin in plugins:
        plugin.new_page()
    md_lines = apply_metadata_handlers(md_lines, metadata_handlers, document)

    substitutions['content'] = MARKDOWN.convert(source=md_lines)

    for plugin in plugins:
        substitutions.update(plugin.variables(document))

    if options['legacy_mode']:
        placeholders = substitutions.get('placeholders')
        if placeholders is not None:
            del substitutions['placeholders']
            substitutions.update(placeholders)
        template = read_lines_from_cached_file_legacy(template_file)
    else:
        template = read_lines_from_cached_file(template_file)

    if substitutions['title'] is None:
        substitutions['title'] = ''

    try:
        result = chevron.render(template, substitutions)
    except chevron.ChevronError as e:
        raise UserError(f"Error processing template: {type(e).__name__}: {e}")

    with open(output_file, 'w') as result_file:
        result_file.write(result)

    if verbose:
        print(f'Output file generated: {output_location}')
    if report:
        print(output_location)
def set_echonest_key(self):
    self.echonest_key = read_lines_from_file(ConnectionSettings.echonest_key_path)[0]

def set_musescore_key(self):
    self.musescore_key = read_lines_from_file(ConnectionSettings.musescore_key_path)[0]
def __read_config(self):
    """Read grub configuration file, pass lines to __evaluate()."""
    grub_lines = utils.read_lines_from_file(self.menu)
    for line in grub_lines:
        self.__evaluate(line)
    self.__find_separator()
def sample_signature(sequences, allowed_frames, image_size, allow_mirror):
    """
    Function to create a unique batch signature for the Dreyeve dataset.

    :param sequences: sequences to sample from.
    :param allowed_frames: range of allowed frames to sample the sequence start from.
    :param image_size: in the form (h, w). Needed to crop randomly.
    :param allow_mirror: whether or not to enable random mirroring.
    :return: a tuple like (num_run, start, hc1, hc2, wc1, wc2, do_mirror).
    """
    h, w = image_size
    h_c = h // 4
    w_c = w // 4
    h_before_crop, w_before_crop = frame_size_before_crop

    # get random sequence
    num_run = choice(sequences)

    # get random start of sequence
    p = np.ones(total_frames_each_run)
    mask = np.zeros(total_frames_each_run)
    mask[np.array(allowed_frames)] = 1

    if force_sample_steering:
        steering_dir_file = join(dreyeve_dir, '{:02d}'.format(num_run),
                                 'steering_directions.txt')
        steering_dirs = read_lines_from_file(steering_dir_file)
        prob_straight = 1 - float(len([s for s in steering_dirs if s == 'STRAIGHT'])) / len(steering_dirs)
        prob_left = 1 - float(len([s for s in steering_dirs if s == 'LEFT'])) / len(steering_dirs)
        prob_right = 1 - float(len([s for s in steering_dirs if s == 'RIGHT'])) / len(steering_dirs)
        p[[i for i in xrange(0, len(steering_dirs)) if steering_dirs[i] == 'STRAIGHT']] = prob_straight
        p[[i for i in xrange(0, len(steering_dirs)) if steering_dirs[i] == 'LEFT']] = prob_left
        p[[i for i in xrange(0, len(steering_dirs)) if steering_dirs[i] == 'RIGHT']] = prob_right

    p *= mask
    p /= np.sum(p)

    start = np.random.choice(range(0, total_frames_each_run), p=p)

    # get random crop
    if crop_type == 'central':
        hc1 = h_before_crop // 4
        wc1 = w_before_crop // 4
    elif crop_type == 'random':
        hc1 = np.random.randint(0, h_before_crop - h_c)
        wc1 = np.random.randint(0, w_before_crop - w_c)
    else:
        raise ValueError
    hc2 = hc1 + h_c
    wc2 = wc1 + w_c

    do_mirror = choice([True, False]) if allow_mirror else False

    return tuple((num_run, start, hc1, hc2, wc1, wc2, do_mirror))
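# A minimal usage sketch (assumed values, not from the original source):
# sample_signature also relies on module-level settings such as
# frame_size_before_crop, total_frames_each_run, crop_type, dreyeve_dir and
# force_sample_steering, so the call below only illustrates the interface.
#   signature = sample_signature(sequences=range(1, 75),
#                                allowed_frames=range(15, 7000),
#                                image_size=(448, 448),
#                                allow_mirror=True)
#   num_run, start, hc1, hc2, wc1, wc2, do_mirror = signature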
def load_queries(queries_dir):
    queries_dict = {}
    for i, line in enumerate(read_lines_from_file(queries_dir)):
        args = line.split(":")
        queries_dict[args[0]] = ' '.join(args[1:len(args)])
    return queries_dict
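# Usage note (hypothetical file contents, not from the original source): each
# line of the queries file is expected to look like "<query_id>:<query text>",
# so for a file containing
#   q1:deep neural networks
#   q2:figure retrieval
# load_queries returns {'q1': 'deep neural networks', 'q2': 'figure retrieval'}.
# Any additional ':' characters inside the query text are replaced by spaces,
# because the pieces after the first ':' are re-joined with ' '.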
def main(options, profile):
    try:
        match_groups = parse_config.parse_file(
            os.path.join(options["main_dir_path"], "chtf-conf.yaml"))
    except parse_config.ConfigError as err:
        log.critical("{}.".format(err))
        utils.exit(1)

    if options["output_oneline"]:
        longest_name_len = len(max([match_group["name"] for match_group in match_groups], key=len))

    found_threads_urls_prev = []
    url_cache_file_path = os.path.join(options["cache_dir_path"], "prev_urls")
    with contextlib.suppress(FileNotFoundError):
        for line in utils.read_lines_from_file(url_cache_file_path):
            found_threads_urls_prev.append(line.strip())

    all_found_threads_amount = 0
    new_found_threads_amount = 0
    colour = TerminalColour(options["colour_output"])
    board_cache_dir_path = os.path.join(options["cache_dir_path"], "boards")
    found_threads_urls = []
    date_start = datetime.datetime.now()

    for match_group, threads in core.generate_threads(
            options["chan"], match_groups, board_cache_dir_path, options["core_dl_sleep_time"]):
        if options["output_oneline"]:
            oneline_match_name = "{:{}} ".format(match_group["name"], longest_name_len)
            print(" {}".format(oneline_match_name), end="")
        else:
            oneline_match_name = ""
            print(":: {}".format(match_group["name"]), end="")
        sys.stdout.flush()

        found_threads_amount = 0
        for thread in threads:
            log.debug("Thread {} matches keyword {}.".format(thread["url_short"], thread["matching_keyword"]))
            found_threads_amount += 1

            if options["no_duplicate_threads"]:
                if thread["url_short"] in found_threads_urls:
                    found_threads_urls.append(thread["url_short"])
                    continue

            if not options["output_oneline"] and found_threads_amount == 1:
                print()
            else:
                utils.clear_terminal_line()

            found_threads_urls.append(thread["url_short"])
            thread_date = datetime.datetime.fromtimestamp(thread["timestamp"])
            term_len = utils.get_terminal_line_len()

            output_prefix = " "
            if thread["url_short"] not in found_threads_urls_prev:
                output_prefix = " {}!{} ".format(colour.get("IGreen"), colour.get("Reset"))
                if options["colour_output"]:
                    # The colour code len for the new thread indicator ("!").
                    term_len += 11

            output_page = thread["page"]
            if options["colour_output"]:
                if output_page <= 3:
                    page_colour = colour.get("IGreen")
                elif output_page <= 7:
                    page_colour = colour.get("IYellow")
                elif output_page >= 8:
                    page_colour = colour.get("IRed")
            else:
                page_colour = ""

            output = (
                "{prefix}{match_name}/{board:<3} {date} {replies:<3} "
                "{page_col}{page:<2}{reset} {url:<45} ".format(
                    board=thread["board"] + "/",
                    replies=thread["replies"],
                    page=output_page,
                    url=thread["url"],
                    page_col=page_colour,
                    date=utils.pretty_date_delta(thread_date),
                    reset=colour.get("Reset"),
                    prefix=output_prefix,
                    match_name=oneline_match_name,
                )
            )

            thread_subject = thread.get("subject", False)
            if thread_subject:
                thread_subject = thread_subject.encode("ascii", "replace").decode("ascii", "replace")
                thread_subject = utils.strip_html_tags(thread_subject)
                thread_subject = html.unescape(thread_subject)
                output += "sub: {}".format(thread_subject)

            thread_comment = thread.get("comment", False)
            if thread_comment:
                thread_comment = thread_comment.encode("ascii", "replace").decode("ascii", "replace")
                thread_comment = thread_comment.replace("<br>", " ")
                thread_comment = utils.strip_html_tags(thread_comment)
                thread_comment = html.unescape(thread_comment)
                if thread_subject:
                    output += " | "
                output += "com: {}".format(thread_comment)

            if options["colour_output"]:
                # The colour code len for the page number.
                term_len += 11

            print(output[:term_len])

            if thread["url_short"] not in found_threads_urls_prev:
                new_found_threads_amount += 1

            if match_group["urlsavelast"]:
                urlsavelast_dir_path = os.path.join(
                    options["main_dir_path"],
                    "urls_last",
                    match_group["name"],
                    date_start.strftime("%Y"),
                    date_start.strftime("%Y-%m"),
                )
                with contextlib.suppress(FileExistsError):
                    os.makedirs(urlsavelast_dir_path)
                urlsavelast_file_path = os.path.join(urlsavelast_dir_path, date_start.strftime("%Y-%m-%d"))
                utils.append_data_to_file(thread["url"] + "\n", urlsavelast_file_path)
                log.info("Saved thread url {} to file {}".format(thread["url"], urlsavelast_file_path))

            if match_group["browser"]:
                try:
                    utils.open_in_web_browser(thread["url"])
                except utils.Error as err:
                    log.error("{}.".format(err))

        if found_threads_amount == 0:
            utils.clear_terminal_line()
            continue
        else:
            all_found_threads_amount += found_threads_amount

    with contextlib.suppress(FileNotFoundError):
        os.remove(url_cache_file_path)
    for url in found_threads_urls:
        utils.append_data_to_file(url + "\n", url_cache_file_path)

    dead_threads_amount = 0
    for url in found_threads_urls_prev:
        if url not in found_threads_urls:
            dead_threads_amount += 1

    date_end = datetime.datetime.now()
    date_next_refresh = date_end + datetime.timedelta(seconds=options["refresh_time"])
    print(
        "\n"
        "{} thread{}, {} unique; {} new; "
        "{} that matched on the previous run but not now.\n"
        "\n"
        "Start time: {}.\n"
        "End time: {}.\n"
        "Next refresh: {}.".format(
            all_found_threads_amount,
            "" if all_found_threads_amount == 1 else "s",
            len(set(found_threads_urls)),
            "no" if new_found_threads_amount == 0 else new_found_threads_amount,
            dead_threads_amount,
            date_start.strftime("%Y-%m-%d %H:%M:%S"),
            date_end.strftime("%Y-%m-%d %H:%M:%S"),
            date_next_refresh.strftime("%Y-%m-%d %H:%M:%S"),
        ),
        end="",
    )
def process_grobid(grobid_files_directory):
    """Process the Grobid output files to extract the relevant parts from them.

    Args:
      grobid_files_directory: (string) the folder with the outputs of Grobid.

    Returns:
      (string). The directory of the folder with the processed files.
    """
    temp_output_dir = str(time.time())
    os.mkdir(temp_output_dir)
    paragraph_begin = ['<p>']
    paragraph_end = '</p>'
    fig_begin = ['<figDesc>']
    fig_end = '</figDesc>'
    title_begin = ['<title level="a" type="main">']
    title_end = '</title>'
    intro_begin = [
        '<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">Introduction</head>',
        '<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">INTRODUCTION</head>',
        '<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">introduction</head>',
        '<div xmlns="http://www.tei-c.org/ns/1.0"><head>Introduction</head>',
        '<div xmlns="http://www.tei-c.org/ns/1.0"><head>INTRODUCTION</head>',
        '<div xmlns="http://www.tei-c.org/ns/1.0"><head>introduction</head>',
        '<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head>',
        '<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">INTRODUCTION</head>',
        '<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">introduction</head>'
    ]
    intro_end = '</div>'
    for single_dir in os.listdir(grobid_files_directory):
        if not os.path.isfile(grobid_files_directory + '/' + single_dir) or single_dir == '.DS_Store':
            continue
        article_lines = utils.read_lines_from_file(grobid_files_directory + '/' + single_dir)
        article_text = get_article_text(article_lines, paragraph_begin, paragraph_end)
        figure_captions = get_article_text(article_lines, fig_begin, fig_end)
        abstract = get_article_abstract(article_lines)
        title = get_article_text(article_lines, title_begin, title_end)
        introduction = get_article_text(article_lines, intro_begin, intro_end)
        with open(temp_output_dir + '/' + single_dir, 'w+') as output_file:
            for line in article_text:
                if len(line) != 0:
                    output_file.write(line + '\n')
            output_file.write('<figcaptions>\n')
            for caption in figure_captions:
                output_file.write(caption + '\n')
            output_file.write('<title>\n')
            if len(title) > 0:
                output_file.write(title[0] + '\n')
            output_file.write('<abstract>\n')
            output_file.write(abstract + '\n')
            output_file.write('<introduction>\n')
            if len(introduction) > 0:
                output_file.write(introduction[0] + '\n')
    return temp_output_dir
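# All of the snippets above assume a read_lines_from_file helper (usually
# exposed via a utils module). Its exact behaviour differs between projects:
# most call sites treat the result as a list of lines, parse_qrel_file passes
# a delimiter and expects each line pre-split into fields, and the md2html
# helpers concatenate the result with strings, which implies a single string.
# The sketch below is only an assumption covering the list-of-lines case, not
# the original implementation from any of these projects.
def read_lines_from_file(file_path):
    """Return the lines of file_path as a list of strings (assumed helper)."""
    with open(file_path, 'r', encoding='utf-8') as input_file:
        return input_file.readlines()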