def get_related_figures(identifiers_dir, text_data_dir, embeddings_dir,
                        test_identifiers_dir, output_dir):
    """Find semantically related figures for each test figure.

    Performs keyword (tf-idf) retrieval over the whole collection, re-ranks
    the candidates with embedding similarity, and writes one CSV line per
    test figure: the figure id followed by the ids of its related figures.

    Args:
      identifiers_dir: (string) identifiers of all figures in the collection.
      text_data_dir: (string) the file with the text for each figure (for keyword retrieval purposes).
      embeddings_dir: (string) the embedding vectors for all figures in the collection.
      test_identifiers_dir: (string) the figures for which we want to find related figures (a subset of full collection)
      output_dir: (string) directory for the output data.

    Returns:
      None. Outputs the related figures to a file.
    """
    query_ids = utils.read_lines_from_file(test_identifiers_dir)
    collection_ids = utils.read_lines_from_file(identifiers_dir)
    tf_idf = KnnSearcher.get_tf_idf_embeddings(text_data_dir)
    knn = KnnSearcher(tf_idf, collection_ids, 100)
    candidates = knn.perform_keyword_retrieval(query_ids)
    embeddings = utils.load_embeddings(embeddings_dir)
    reranked = re_rank_with_embeddings(candidates, embeddings, collection_ids)

    with open(output_dir, 'w+') as out:
        for fig_id in reranked:
            # Drop the query figure itself from its own related list.
            related = [pair[0] for pair in reranked[fig_id] if pair[0] != fig_id]
            out.write(','.join([fig_id] + related) + '\n')
def get_tf_idf_vectors(data_dir, ids_dir, min_df=5):
    """Build tf-idf vectors for the figure texts.

    Args:
      data_dir: (string) file with one text line per figure.
      ids_dir: (string) file with the matching figure identifiers.
      min_df: (int) minimum document frequency for a term to be kept.

    Returns:
      (sparse matrix, list). The tf-idf vectors and the identifiers.
    """
    identifiers = utils.read_lines_from_file(ids_dir)
    texts = utils.read_lines_from_file(data_dir)
    term_counts = CountVectorizer(min_df=min_df).fit_transform(texts)
    vectors = TfidfTransformer().fit_transform(term_counts)
    return vectors, identifiers
示例#3
0
    def parse(self):
        """Parse the grub menu file into self.title_blocks.

        Each "title" line opens a new numbered block; subsequent kernel,
        initrd and append lines are appended to the most recent block as
        (keyword, rest-of-line) tuples.
        """
        menu_lines = utils.read_lines_from_file(self.menu)
        block_count = 0
        current_key = ""

        for raw in menu_lines:
            if raw.startswith("title"):
                # Start a new numbered block for this title entry.
                block_count += 1
                current_key = str(block_count)
                self.title_blocks[current_key] = [(raw[:5], raw[6:])]
            elif raw[:6] in ("kernel", "initrd", "append"):
                # Attach the option to the block opened by the last title.
                self.title_blocks[current_key].append((raw[:6], raw[7:]))
 def parse(self):
     """Parse the grub menu file into self.title_blocks.

     Each "title" line starts a new numbered block; kernel/initrd/append
     lines are appended to the current block as (keyword, value) tuples.
     """
     grub_lines = utils.read_lines_from_file(self.menu)
     block_num = 0
     last_title = ""
     block_value = []

     for line in grub_lines:
         if line[:5] == "title":
             # New block: number it and start a fresh value list.
             block_num += 1
             block_value = []
             last_title = str(block_num)
             block_value.append((line[:5],line[6:]))
             self.title_blocks[str(block_num)] = block_value
             continue

         if line[:6] == "kernel":
             self.title_blocks[last_title].append((line[:6],line[7:]))
             continue

         if line[:6] == "initrd":
             self.title_blocks[last_title].append((line[:6],line[7:]))
             continue

         if line[:6] == "append":
             self.title_blocks[last_title].append((line[:6],line[7:]))
             continue
    def __change_config(self, identifier, old_value, new_value):
        """Change the configuration file based on the passed values.

        Finds the line starting with `identifier` (inserting a fresh blank
        line at the top when none exists), rewrites it, and writes the file
        back to disk.

        Args:
          identifier: string locating the target line (see __get_line_number).
          old_value: substring to replace. None replaces the whole line;
            "vga=" is handled specially (replace up to the next space, or
            append at the end of the line when absent).
          new_value: the replacement text.
        """
        line_number = self.__get_line_number(identifier)
        lines = utils.read_lines_from_file(self.menu)
        if line_number == -1:
            # Identifier not found: edit a new blank first line instead.
            line_number = 0
            lines.insert(0, "\n")
        line = lines[line_number]
        if old_value is None:
            # Replace the whole line.
            lines[line_number] = new_value
        elif old_value == "vga=":
            place = line.find(old_value)
            end = line.find(" ", place)
            if place != -1:
                if end != -1:
                    # Replace the existing vga= option up to the next space.
                    line = line[:place] + new_value + line[end:]
                else:
                    # vga= is the last option: replace to the end of the line.
                    line = line[:place] + new_value + "\n"
            else:
                # No vga= option yet: append it at the end of the line.
                line = line[:-1] + " " + new_value + "\n"
            lines[line_number] = line
        else:
            # Match the value with its leading separator ("# " or " ") so the
            # replacement hits the exact text present in the file.
            if old_value[0] != "#" and line.find(" " + old_value) != -1:
                old_value = " " + old_value

            if old_value[0] == "#" and line.find("# " + old_value[1:]) != -1:
                old_value = "# " + old_value[1:]

            line = line.replace(old_value, new_value)
            lines[line_number] = line

        utils.write_lines_to_file(self.menu, lines)
示例#6
0
    def __change_config(self, identifier, old_value, new_value):
        """Change the configuration file based on the passed values.

        Finds the line starting with `identifier` (inserting a fresh blank
        line at the top when none exists), rewrites it, and writes the file
        back to disk.

        Args:
          identifier: string locating the target line (see __get_line_number).
          old_value: substring to replace. None replaces the whole line;
            "vga=" is handled specially (replace up to the next space, or
            append at the end of the line when absent).
          new_value: the replacement text.
        """
        line_number = self.__get_line_number(identifier)
        lines = utils.read_lines_from_file(self.menu)
        if line_number == -1:
            # Identifier not found: edit a new blank first line instead.
            line_number = 0
            lines.insert(0, "\n")
        line = lines[line_number]
        if old_value == None:
            # Replace the whole line.
            lines[line_number] = new_value
        elif old_value == "vga=":
            place = line.find(old_value)
            end = line.find(" ", place)
            if place != -1:
                if end != -1:
                    # Replace the existing vga= option up to the next space.
                    line = line[:place] + new_value + line[end:]
                else:
                    # vga= is the last option: replace to the end of the line.
                    line = line[:place] + new_value + "\n"
            else:
                # No vga= option yet: append it at the end of the line.
                line = line[:-1] + " " + new_value + "\n"
            lines[line_number] = line
        else:
            # Match the value with its leading separator ("# " or " ") so the
            # replacement hits the exact text present in the file.
            if old_value[0] != "#" and line.find(" " + old_value) != -1:
                old_value = " " + old_value

            if old_value[0] == "#" and line.find("# " + old_value[1:]) != -1:
                old_value = "# " + old_value[1:]

            line = line.replace(old_value, new_value)
            lines[line_number] = line

        utils.write_lines_to_file(self.menu, lines)
def get_figure_mentions_by_words(grobid_article_dir, figure_num, window_size,
                                 captions_dict, mentions_dict):
    """Get the text of figure mentions in the article using word windows.

    Args:
      grobid_article_dir: (string) the directory the processed Grobid file of the article.
      figure_num: (string) the figure number.
      window_size: (int) the number of words that should surround a figure mention.
      captions_dict: (dictionary) the captions of the figures.
      mentions_dict: (dictionary) the locations in the texts where the mentions are.

    Returns:
      (string, string). The caption text and the quoted mention snippets
      (snippets are wrapped in '"... ... "' ellipsis markers).
    """
    mentions_text = '\"...'
    caption_text = ''

    # Flatten the article into a single list of words.
    article_lines = utils.read_lines_from_file(grobid_article_dir)
    doc_txt = ''

    for line in article_lines:
        line = line.rstrip('\n')
        doc_txt += line + ' '
    doc_txt = doc_txt.rstrip(' ')
    words = doc_txt.split(' ')

    if str(figure_num) in captions_dict.keys():
        # Keep the shortest candidate caption line for this figure.
        for caption_id in captions_dict[str(figure_num)]:
            if len(article_lines[caption_id]) < len(caption_text) or len(
                    caption_text) == 0:
                caption_text = article_lines[caption_id] + ' '
        caption_text = caption_text.rstrip(' ')
        caption_text = utils.process_caption(caption_text, figure_num)

    if str(figure_num) in mentions_dict.keys():
        all_summary_ids = []
        for word_id in mentions_dict[str(figure_num)]:
            # Collect window_size word ids before and after each mention.
            summary_ids = []
            for i in range(window_size, 0, -1):
                summary_ids += [word_id - i]

            summary_ids += [word_id]
            for i in range(1, window_size + 1):
                summary_ids += [word_id + i]

            all_summary_ids += [summary_ids]

        # Merge overlapping windows so shared words are not repeated.
        all_summary_ids = merge_texts(all_summary_ids)

        for summary_ids in all_summary_ids:
            for i in summary_ids:
                # Skip window positions that fall outside the document.
                if i < 0 or i >= len(words):
                    continue
                mentions_text += ' ' + words[i]
            mentions_text += ' ... '
    mentions_text = mentions_text.rstrip(' ') + '\"'

    return caption_text, mentions_text
示例#8
0
 def __has_separate_boot_partition(self):
     """Try to find out if there is a separate boot partition.
     Return True if it is likely (i.e. /etc/fstab has a line
     containing '/boot').
     """
     fstab_lines = utils.read_lines_from_file(self.fstab)
     return any("/boot" in entry for entry in fstab_lines)
 def __has_separate_boot_partition(self):
     """Try to find out if there is a separate boot partition.
     Return True if it is likely(If /etc/fstab
     has a line containing '/boot', that is)
     """
     for line in utils.read_lines_from_file(self.fstab):
         # Any mention of /boot is taken as evidence of a boot partition.
         if line.find("/boot") != -1:
             return True
     return False
示例#10
0
def read_lines_from_cached_file_legacy(template_file):
    """Return the template text, converting legacy placeholders on first read.

    Results are memoized in CACHED_FILES, keyed by the template path, so
    each template file is read and converted only once.
    """
    cached = CACHED_FILES.get(template_file)
    if cached is not None:
        return cached
    text = read_lines_from_file(template_file)
    # Rewrite legacy placeholder syntax into mustache-style braces.
    text = re.sub(LEGACY_PLACEHOLDERS_UNESCAPED_REPLACEMENT_PATTERN,
                  r'\1{{{\2}}}', text)
    text = re.sub(LEGACY_PLACEHOLDERS_REPLACEMENT_PATTERN, r'\1{{\2}}', text)
    CACHED_FILES[template_file] = text
    return text
示例#11
0
def extract_all_figures(all_figures_dir, count_threshold=5):
    """Generate textual representations to all figures in the collection.

    Args:
      all_figures_dir: (string) the folder with the textual fields for the figures (the files used to build an index).
      count_threshold: (int) number of words to consider a figure.

    Returns:
      (list, list, list). figure tokens, figure identifiers, figure_image_files.
    """

    text_data, figure_identifiers, image_file_names = [], [], []
    for i, article_file_name in enumerate(os.listdir(all_figures_dir)):
        figure_flag = False
        caption, mention, figure_num, image_file_name = '', '', '', ''

        for line in utils.read_lines_from_file(all_figures_dir + '/' +
                                               article_file_name):
            if '<figure>' in line:
                if figure_flag:
                    tokens = text_to_tokens(caption + ' ' + mention)
                    if len(tokens) > count_threshold:
                        text_data.append(tokens)
                        figure_identifiers.append(
                            article_file_name.split('.')[0] + '_' + figure_num)
                        image_file_names.append(image_file_name)
                figure_flag = False
                caption, mention, figure_num, image_file_name = '', '', '', ''

                if '<figure>' in line:
                    figure_num = line.rstrip('\n').rstrip('</figure>').lstrip(
                        '<figure>')
                    if len(figure_num) == 2:
                        figure_num = figure_num.replace('.', '')
                    figure_flag = True

            if '<caption>' in line and figure_flag:
                caption = line.rstrip('\n').rstrip('</caption>').lstrip(
                    '<caption>')
            if '<lines3>' in line and figure_flag:
                mention = line.rstrip('\n').rstrip('</lines3>').lstrip(
                    '<lines3>')
            if '<file>' in line and figure_flag:
                image_file_name = line.rstrip('\n').rstrip('</file>').lstrip(
                    '<file>')

        if figure_flag:
            tokens = text_to_tokens(caption + ' ' + mention)
            if len(tokens) > count_threshold:
                text_data.append(tokens)
                figure_identifiers.append(
                    article_file_name.split('.')[0] + '_' + figure_num)
                image_file_names.append(image_file_name)

    return text_data, figure_identifiers, image_file_names
def parse_qrel_file(qrel_dir):
    """Read a qrel file and map each query id to its relevant documents.

    Only rows whose relevance label (4th column) equals '1' are kept.

    Args:
      qrel_dir: (string) path of the space-separated qrel file.

    Returns:
      (dictionary). Query id -> list of relevant document ids.
    """
    relevant = {}
    for row in utils.read_lines_from_file(qrel_dir, delimiter=' '):
        if row[3] == '1':
            relevant.setdefault(row[0], []).append(row[2])
    return relevant
示例#13
0
 def __fix_menu_lst_bug(self):
     """Take care of an annoying thing in menu.lst.

     If exactly two "password" entries are found (possibly commented out),
     the first one is commented out via __change_config.
     """
     grub_lines = utils.read_lines_from_file(self.menu)
     password_lines = []
     for line in grub_lines:
         # Fix: the original called line.strip() and discarded the result,
         # making the normalization a no-op; assign it back instead.
         line = line.strip()
         if line.startswith("#"):
             line = line[1:].strip()
         if line[:8] == "password":
             password_lines.append(line)
     if len(password_lines) == 2:
         self.__change_config(password_lines[0], None,
                              "## " + password_lines[0] + "\n")
示例#14
0
 def __fix_menu_lst_bug(self):
     """Take care of an annoying thing in menu.lst.

     If exactly two "password" entries are found (possibly commented out),
     the first one is commented out via __change_config.
     """
     grub_lines = utils.read_lines_from_file(self.menu)
     password_lines = []
     for line in grub_lines:
         # NOTE(review): the result of strip() is discarded here — this
         # looks like it was meant to be `line = line.strip()`; confirm.
         line.strip()
         if line[0] == "#":
             line = line[1:].strip()
         if line[:8] == "password":
             password_lines.append(line)
     if len(password_lines) == 2:
         self.__change_config(password_lines[0], None,
                              "## " + password_lines[0] + "\n")
示例#15
0
 def __get_line_number(self, identifier):
     """Return the index of the first line that begins with `identifier`.

     A leading '#' (commented-out entry) is ignored when matching.
     Returns -1 when no line matches.
     """
     length = len(identifier)
     for index, raw in enumerate(utils.read_lines_from_file(self.menu)):
         candidate = raw.strip()
         if candidate[:1] == "#":
             # Drop the comment marker so commented entries still match.
             candidate = candidate[1:]
         if candidate.strip()[:length].strip() == identifier:
             return index
     return -1
示例#16
0
 def __get_line_number(self, identifier):
     """Return the index of the first line that begins with `identifier`.

     A leading '#' (commented-out entry) is ignored when matching.
     Returns -1 when no line matches.
     """
     lines = utils.read_lines_from_file(self.menu)
     length = len(identifier)
     tracker = 0
     for line in lines:
         if line.strip()[:1] == "#":
             # Drop the comment marker so commented entries still match.
             line = line.strip()[1:]
         line = line.strip()[:length].strip()
         if line == identifier:
             return tracker
         tracker += 1
     return -1
示例#17
0
    def get_tf_idf_embeddings(data_dir):
        """Get tf-idf matrix of the figures in the collection.

        Args:
          data_dir: (string) a file with the text data of the figures.

        Returns:
          tf.idf Matrix.
        """
        texts = utils.read_lines_from_file(data_dir)
        term_counts = CountVectorizer().fit_transform(texts)
        return TfidfTransformer().fit_transform(term_counts)
示例#18
0
def get_captions(grobid_article_dir):
    """Get the captions of all figures from the Grobid processed files.

    Scans the '<figcaptions>' section of the processed article (up to the
    '<title>' marker) and records, for each figure number, the line ids of
    the caption lines that mention it.

    Args:
      grobid_article_dir: (string) the directory the processed grobid file of the article.
    Returns:
      (dictionary). Mapping from figure numbers to lists of caption line ids.
    """
    figure_caption_dict = {}
    article_lines = utils.read_lines_from_file(grobid_article_dir)
    fig_captions_flag = False

    for line_id, line in enumerate(article_lines):
        if '<figcaptions>' in line:
            fig_captions_flag = True
            continue

        if '<title>' in line:
            # The captions section ends where the title section begins.
            break

        if not fig_captions_flag:
            continue

        words = utils.process_line(line)
        for i, word in enumerate(words):
            # Only inspect words that mention a figure (e.g. "fig", "figure").
            if not any(fig_format in word.lower()
                       for fig_format in figure_formats):
                continue

            figure_num = utils.extract_number(words[i])

            try:
                int(figure_num)
            except (TypeError, ValueError):
                # Not a valid number; the figure number may be the next word.
                # (Was a bare `except:`, which also swallowed e.g. KeyboardInterrupt.)
                if len(words) > i + 1:
                    figure_num = utils.extract_number(words[i + 1])
                else:
                    continue

            if figure_num not in figure_caption_dict:
                figure_caption_dict[figure_num] = []
            if line_id not in figure_caption_dict[figure_num]:
                figure_caption_dict[figure_num] += [line_id]

    return figure_caption_dict
示例#19
0
def get_article_field(grobid_article_dir, field_name):
    """Get a specific article field of the figure's article.

    The field content is expected on the line following its '<field>' tag;
    if the tag occurs several times, the last occurrence wins.

    Args:
      grobid_article_dir: (string) the directory the processed Grobid file of the article.
      field_name: (string) the field name such as 'introduction'
    Returns:
      (string). The content of the textual field ('' when absent).
    """
    marker = '<' + field_name + '>'
    content = ''
    take_next = False
    for article_line in utils.read_lines_from_file(grobid_article_dir):
        if take_next:
            content = article_line.rstrip('\n')
            take_next = False
        if marker in article_line:
            take_next = True
    return content
示例#20
0
def get_figure_ids_list(grobid_article_dir):
    """Get the list of all figure identifiers in a single article.

    Args:
      grobid_article_dir: (string) the directory the processed grobid file of the article.
    Returns:
      (list). The list of figure identifiers (numbers stored as strings).
    """
    figure_list = []
    doc_txt = ''

    # Flatten the article into one space-separated string of words.
    lines = utils.read_lines_from_file(grobid_article_dir)
    for line in lines:
        line = line.rstrip('\n')
        doc_txt += line + ' '
    doc_txt = doc_txt.rstrip(' ')
    words = doc_txt.split(' ')

    for word_id, word in enumerate(words):
        # process_line returns a list of cleaned tokens; skip empties.
        word = utils.process_line(word)
        if len(word) == 0:
            continue

        word = word[0]
        if word.split('.')[0].lower() in figure_formats:
            if len(word.split('.')) > 1:
                # The number is attached to the word itself, e.g. "fig.3".
                word_length = len(word.split('.'))
                figure_num = utils.extract_number('.'.join(
                    word.split('.')[1:word_length]))
            elif len(words) > word_id + 1:
                # Otherwise the number should be the following word.
                figure_num = utils.extract_number(words[word_id + 1])
            else:
                continue

            # Deduplicate and ignore empty extractions.
            if figure_num not in figure_list and figure_num != '':
                figure_list += [figure_num]
    return figure_list
示例#21
0
    def __check_themes(self):
        """Check for installed themes, send info to class variables.

        Sets self.default_image from the menu's "splash" entry when it
        points at an existing *.xpm.gz file in the splash directory, and
        fills self.grub_images with the names of all installed themes.
        """
        self.grub_images = []
        # Hoisted: the original called __get_line_number("splash") twice,
        # re-reading the menu file each time.
        splash_line_number = self.__get_line_number("splash")
        if splash_line_number != -1:
            grub_lines = utils.read_lines_from_file(self.menu)
            splash_line = grub_lines[splash_line_number].strip()
            # Only trust entries pointing into the mounted splash directory.
            correct_path = splash_line.find(self.groot +
                                            self.mounted_splashdir)
            if correct_path != -1:
                location = splash_line.rfind('/')
                file_name = splash_line[location + 1:].replace('\\', '')
                suffix = file_name.find(".xpm.gz")
                if suffix != -1:
                    name = file_name[:suffix]
                    # Use the name only if the file actually exists on disk.
                    for dirfile in os.listdir(self.mounted_splashdir):
                        if file_name == dirfile:
                            self.default_image = name

        # Collect every *.xpm.gz theme, stripping the extension.
        grub_files = os.listdir(self.mounted_splashdir)
        for grub_file in grub_files:
            end = grub_file[grub_file[:-3].rfind("."):]
            if end == ".xpm.gz":
                self.grub_images.append(grub_file[:-7])
示例#22
0
    def __check_themes(self):
        """Check for installed themes, send info to class variables.

        Sets self.default_image from the menu's "splash" entry when it
        points at an existing *.xpm.gz file in the splash directory, and
        fills self.grub_images with the names of all installed themes.
        """
        self.grub_images = []
        if self.__get_line_number("splash") != -1:
            grub_lines = utils.read_lines_from_file(self.menu)
            location = self.__get_line_number("splash")
            splash_line = grub_lines[location].strip()
            # Only trust entries pointing into the mounted splash directory.
            correct_path = splash_line.find(self.groot +
                                            self.mounted_splashdir)
            if correct_path != -1:
                location = splash_line.rfind('/')
                file_name = splash_line[location + 1:].replace('\\', '')
                suffix = file_name.find(".xpm.gz")
                if suffix != -1:
                    name = file_name[:suffix]
                    # Use the name only if the file actually exists on disk.
                    for dirfile in os.listdir(self.mounted_splashdir):
                        if file_name == dirfile:
                            self.default_image = name

        # Collect every *.xpm.gz theme, stripping the extension.
        grub_files = os.listdir(self.mounted_splashdir)
        for grub_file in grub_files:
            end = grub_file[grub_file[:-3].rfind("."):]
            if end == ".xpm.gz":
                self.grub_images.append(grub_file[:-7])
示例#23
0
def md2html(document, plugins, metadata_handlers, options):
    """Render a single Markdown document to an HTML file.

    Skips generation when the output is newer than the input (unless
    'force' is set), builds the substitution map (title, styles, dates,
    plugin variables), converts the Markdown body, and renders it through
    the chevron template.

    Args:
      document: dict describing one document (input/output paths, title,
        template, CSS options, force/verbose/report flags).
      plugins: plugin objects queried for per-document template variables.
      metadata_handlers: handlers applied to the Markdown source lines.
      options: global options; 'legacy_mode' selects the legacy template
        placeholder syntax.

    Raises:
      UserError: when the chevron template cannot be processed.
    """
    input_location = document['input_file']
    output_location = document['output_file']
    title = document['title']
    template_file = document['template']
    link_css = document['link_css']
    include_css = document['include_css']
    force = document['force']
    verbose = document['verbose']
    report = document['report']

    output_file = Path(output_location)
    input_file = Path(input_location)

    # Regenerate only when the source is newer than the existing output.
    if not force and output_file.exists():
        output_file_mtime = os.path.getmtime(output_file)
        input_file_mtime = os.path.getmtime(input_file)
        if output_file_mtime > input_file_mtime:
            if verbose:
                print(
                    f'The output file is up-to-date. Skipping: {output_location}'
                )
            return

    current_time = datetime.today()
    substitutions = {
        'title': title,
        'exec_name': EXEC_NAME,
        'exec_version': EXEC_VERSION,
        'generation_date': current_time.strftime('%Y-%m-%d'),
        'generation_time': current_time.strftime('%H:%M:%S')
    }
    # Linked CSS becomes <link> tags; included CSS is inlined in <style> tags.
    styles = []
    if link_css:
        styles.extend([
            f'<link rel="stylesheet" type="text/css" href="{item}">'
            for item in link_css
        ])
    if include_css:
        styles.extend([
            '<style>\n' + read_lines_from_file(item) + '\n</style>'
            for item in include_css
        ])
    substitutions['styles'] = '\n'.join(styles) if styles else ''

    md_lines = read_lines_from_file(input_file)
    # Let every plugin reset its per-page state before conversion.
    for plugin in plugins:
        plugin.new_page()
    md_lines = apply_metadata_handlers(md_lines, metadata_handlers, document)

    substitutions['content'] = MARKDOWN.convert(source=md_lines)

    for plugin in plugins:
        substitutions.update(plugin.variables(document))

    if options['legacy_mode']:
        # Legacy mode: 'placeholders' entries are promoted to top-level
        # substitutions and the template goes through the legacy converter.
        placeholders = substitutions.get('placeholders')
        if placeholders is not None:
            del substitutions['placeholders']
            substitutions.update(placeholders)
        template = read_lines_from_cached_file_legacy(template_file)
    else:
        template = read_lines_from_cached_file(template_file)

    if substitutions['title'] is None:
        substitutions['title'] = ''

    try:
        result = chevron.render(template, substitutions)
    except chevron.ChevronError as e:
        raise UserError(f"Error processing template: {type(e).__name__}: {e}")

    with open(output_file, 'w') as result_file:
        result_file.write(result)

    if verbose:
        print(f'Output file generated: {output_location}')
    if report:
        print(output_location)
示例#24
0
 def set_echonest_key(self):
     """Load the EchoNest API key from the configured key file (first line)."""
     key_lines = read_lines_from_file(ConnectionSettings.echonest_key_path)
     self.echonest_key = key_lines[0]
示例#25
0
 def set_musescore_key(self):
     """Load the MuseScore API key from the configured key file (first line)."""
     key_lines = read_lines_from_file(ConnectionSettings.musescore_key_path)
     self.musescore_key = key_lines[0]
示例#26
0
 def __read_config(self):
     """Read the grub configuration file, passing each line to __evaluate().

     Once every line has been evaluated, __find_separator() completes
     the parse.
     """
     for config_line in utils.read_lines_from_file(self.menu):
         self.__evaluate(config_line)
     self.__find_separator()
def sample_signature(sequences, allowed_frames, image_size, allow_mirror):
    """
    Function to create a unique batch signature for the Dreyeve dataset.

    :param sequences: sequences to sample from.
    :param allowed_frames: range of allowed frames to sample the sequence start from.
    :param image_size: in the form (h,w). Needed to crop randomly.
    :param allow_mirror: whether or not to enable random mirroring.
    :return: a tuple like (num_run, start, hc1, hc2, wc1, wc2, do_mirror).
    """
    h, w = image_size
    h_c = h // 4
    w_c = w // 4
    h_before_crop, w_before_crop = frame_size_before_crop

    # get random sequence
    num_run = choice(sequences)

    # get random start of sequence; sampling is restricted to allowed_frames
    p = np.ones(total_frames_each_run)
    mask = np.zeros(total_frames_each_run)
    mask[np.array(allowed_frames)] = 1
    if force_sample_steering:
        # Bias sampling towards rarer steering directions: each frame is
        # weighted by one minus the frequency of its direction.
        steering_dir_file = join(dreyeve_dir, '{:02d}'.format(num_run),
                                 'steering_directions.txt')
        steering_dirs = read_lines_from_file(steering_dir_file)
        n_dirs = float(len(steering_dirs))
        # Fix: the original used Python 2 `xrange`, a NameError on Python 3;
        # the three near-identical direction branches are also folded into
        # one loop.
        for direction in ('STRAIGHT', 'LEFT', 'RIGHT'):
            prob = 1 - len([s for s in steering_dirs
                            if s == direction]) / n_dirs
            idxs = [i for i in range(len(steering_dirs))
                    if steering_dirs[i] == direction]
            p[idxs] = prob
    p *= mask
    p /= np.sum(p)
    start = np.random.choice(range(0, total_frames_each_run), p=p)

    # get random crop
    if crop_type == 'central':
        hc1 = h_before_crop // 4
        wc1 = w_before_crop // 4
    elif crop_type == 'random':
        hc1 = np.random.randint(0, h_before_crop - h_c)
        wc1 = np.random.randint(0, w_before_crop - w_c)
    else:
        raise ValueError('Unknown crop_type: {}'.format(crop_type))

    hc2 = hc1 + h_c
    wc2 = wc1 + w_c

    do_mirror = choice([True, False]) if allow_mirror else False

    return tuple((num_run, start, hc1, hc2, wc1, wc2, do_mirror))
示例#28
0
 def load_queries(queries_dir):
     """Load queries into a dict mapping query id to query text.

     Each line is "id:text"; any further ':' separators inside the text
     are turned into spaces (split on ':' then space-join the remainder).
     """
     queries_dict = {}
     for raw_line in read_lines_from_file(queries_dir):
         parts = raw_line.split(":")
         queries_dict[parts[0]] = ' '.join(parts[1:])
     return queries_dict
示例#29
0
File: cli.py  Project: nzcv/chtf
def main(options, profile):
    """Run one scan cycle over the configured chan boards.

    Loads the match groups from the YAML config, streams matching threads,
    prints them (optionally coloured / one-line), and persists the found
    thread urls so the next run can mark new and dead threads.

    Args:
      options: dict of runtime options (paths, chan name, output/colour
        flags, refresh and download-sleep times).
      profile: accepted for the caller's interface; not used in this body.
    """
    try:
        match_groups = parse_config.parse_file(os.path.join(options["main_dir_path"], "chtf-conf.yaml"))
    except parse_config.ConfigError as err:
        log.critical("{}.".format(err))
        utils.exit(1)

    if options["output_oneline"]:
        # Pad every group name to the longest one so columns line up.
        longest_name_len = len(max([match_group["name"] for match_group in match_groups], key=len))

    # Urls found on the previous run, used to flag new and dead threads.
    found_threads_urls_prev = []
    url_cache_file_path = os.path.join(options["cache_dir_path"], "prev_urls")
    with contextlib.suppress(FileNotFoundError):
        for line in utils.read_lines_from_file(url_cache_file_path):
            found_threads_urls_prev.append(line.strip())

    all_found_threads_amount = 0
    new_found_threads_amount = 0
    colour = TerminalColour(options["colour_output"])
    board_cache_dir_path = os.path.join(options["cache_dir_path"], "boards")
    found_threads_urls = []
    date_start = datetime.datetime.now()

    for match_group, threads in core.generate_threads(
        options["chan"], match_groups, board_cache_dir_path, options["core_dl_sleep_time"]
    ):
        if options["output_oneline"]:
            oneline_match_name = "{:{}} ".format(match_group["name"], longest_name_len)
            print("   {}".format(oneline_match_name), end="")
        else:
            oneline_match_name = ""
            print(":: {}".format(match_group["name"]), end="")
        sys.stdout.flush()

        found_threads_amount = 0

        for thread in threads:
            log.debug("Thread {} matches keyword {}.".format(thread["url_short"], thread["matching_keyword"]))

            found_threads_amount += 1
            if options["no_duplicate_threads"]:
                # NOTE(review): a duplicate url is appended again before being
                # skipped — presumably to keep it in the cache file written
                # below; confirm this is intentional.
                if thread["url_short"] in found_threads_urls:
                    found_threads_urls.append(thread["url_short"])
                    continue

            if not options["output_oneline"] and found_threads_amount == 1:
                print()
            else:
                utils.clear_terminal_line()

            found_threads_urls.append(thread["url_short"])
            thread_date = datetime.datetime.fromtimestamp(thread["timestamp"])
            term_len = utils.get_terminal_line_len()
            output_prefix = "   "

            # Threads unseen on the previous run get a "!" marker.
            if thread["url_short"] not in found_threads_urls_prev:
                output_prefix = " {}!{} ".format(colour.get("IGreen"), colour.get("Reset"))
                if options["colour_output"]:
                    # The colour code len for the new thread indicator ("!").
                    term_len += 11

            # Colour the page number by how close the thread is to pruning.
            output_page = thread["page"]
            if options["colour_output"]:
                if output_page <= 3:
                    page_colour = colour.get("IGreen")
                elif output_page <= 7:
                    page_colour = colour.get("IYellow")
                elif output_page >= 8:
                    page_colour = colour.get("IRed")
            else:
                page_colour = ""

            output = (
                "{prefix}{match_name}/{board:<3}  {date}  {replies:<3}  "
                "{page_col}{page:<2}{reset}  {url:<45}  ".format(
                    board=thread["board"] + "/",
                    replies=thread["replies"],
                    page=output_page,
                    url=thread["url"],
                    page_col=page_colour,
                    date=utils.pretty_date_delta(thread_date),
                    reset=colour.get("Reset"),
                    prefix=output_prefix,
                    match_name=oneline_match_name,
                )
            )
            # Append subject and/or comment, sanitized to plain ASCII text.
            thread_subject = thread.get("subject", False)
            if thread_subject:
                thread_subject = thread_subject.encode("ascii", "replace").decode("ascii", "replace")
                thread_subject = utils.strip_html_tags(thread_subject)
                thread_subject = html.unescape(thread_subject)
                output += "sub: {}".format(thread_subject)
            thread_comment = thread.get("comment", False)
            if thread_comment:
                thread_comment = thread_comment.encode("ascii", "replace").decode("ascii", "replace")
                thread_comment = thread_comment.replace("<br>", " ")
                thread_comment = utils.strip_html_tags(thread_comment)
                thread_comment = html.unescape(thread_comment)
                if thread_subject:
                    output += " | "
                output += "com: {}".format(thread_comment)
            if options["colour_output"]:
                # The colour code len for the page number.
                term_len += 11
            print(output[:term_len])

            if thread["url_short"] not in found_threads_urls_prev:
                new_found_threads_amount += 1
                # Optionally archive new thread urls under urls_last/<group>/<year>/<month>.
                if match_group["urlsavelast"]:
                    urlsavelast_dir_path = os.path.join(
                        options["main_dir_path"],
                        "urls_last",
                        match_group["name"],
                        date_start.strftime("%Y"),
                        date_start.strftime("%Y-%m"),
                    )
                    with contextlib.suppress(FileExistsError):
                        os.makedirs(urlsavelast_dir_path)
                    urlsavelast_file_path = os.path.join(urlsavelast_dir_path, date_start.strftime("%Y-%m-%d"))
                    utils.append_data_to_file(thread["url"] + "\n", urlsavelast_file_path)
                    log.info("Saved thread url {} to file {}".format(thread["url"], urlsavelast_file_path))
                if match_group["browser"]:
                    try:
                        utils.open_in_web_browser(thread["url"])
                    except utils.Error as err:
                        log.error("{}.".format(err))
        if found_threads_amount == 0:
            utils.clear_terminal_line()
            continue
        else:
            all_found_threads_amount += found_threads_amount
    # Rewrite the url cache with this run's results.
    with contextlib.suppress(FileNotFoundError):
        os.remove(url_cache_file_path)
    for url in found_threads_urls:
        utils.append_data_to_file(url + "\n", url_cache_file_path)
    # Threads seen last run but not this one are counted as dead.
    dead_threads_amount = 0
    for url in found_threads_urls_prev:
        if url not in found_threads_urls:
            dead_threads_amount += 1

    date_end = datetime.datetime.now()
    date_next_refresh = date_end + datetime.timedelta(seconds=options["refresh_time"])
    print(
        "\n"
        "{} thread{}, {} unique; {} new; "
        "{} that matched on the previous run but not now.\n"
        "\n"
        "Start time:    {}.\n"
        "End time:      {}.\n"
        "Next refresh:  {}.".format(
            all_found_threads_amount,
            "" if all_found_threads_amount == 1 else "s",
            len(set(found_threads_urls)),
            "no" if new_found_threads_amount == 0 else new_found_threads_amount,
            dead_threads_amount,
            date_start.strftime("%Y-%m-%d %H:%M:%S"),
            date_end.strftime("%Y-%m-%d %H:%M:%S"),
            date_next_refresh.strftime("%Y-%m-%d %H:%M:%S"),
        ),
        end="",
    )
示例#30
0
 def __read_config(self):
     """Read grub configuration file, pass lines to __evaluate().

     After all lines are evaluated, __find_separator() finishes the parse.
     """
     grub_lines = utils.read_lines_from_file(self.menu)
     for line in grub_lines:
         self.__evaluate(line)
     self.__find_separator()
def process_grobid(grobid_files_directory):
    """Perform processing of the Grobid output files to get the relevant parts from it.

    For each Grobid TEI-XML file, extracts the paragraph text, figure
    captions, title, abstract and introduction, and writes them (separated
    by section markers such as '<figcaptions>') into a fresh temporary
    directory, one output file per input file.

    Args:
      grobid_files_directory: (string) the folder with the outputs of Grobid.

    Returns:
      (string). The directory of the folder with the processed files.
    """
    # Timestamp-named scratch directory; caller is responsible for cleanup.
    temp_output_dir = str(time.time())
    os.mkdir(temp_output_dir)

    # Begin/end markers delimiting the TEI-XML sections we extract.
    paragraph_begin = ['<p>']
    paragraph_end = '</p>'
    fig_begin = ['<figDesc>']
    fig_end = '</figDesc>'
    title_begin = ['<title level="a" type="main">']
    title_end = '</title>'
    # Grobid emits several header variants for the introduction section,
    # depending on numbering style and capitalization in the source PDF.
    intro_begin = ['<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">Introduction</head>',
                   '<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">INTRODUCTION</head>',
                   '<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">introduction</head>',
                   '<div xmlns="http://www.tei-c.org/ns/1.0"><head>Introduction</head>',
                   '<div xmlns="http://www.tei-c.org/ns/1.0"><head>INTRODUCTION</head>',
                   '<div xmlns="http://www.tei-c.org/ns/1.0"><head>introduction</head>',
                   '<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head>',
                   '<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">INTRODUCTION</head>',
                   # Bug fix: this variant was '<headn="1.">' (missing space),
                   # so lowercase 'introduction' with dotted numbering never matched.
                   '<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">introduction</head>'
                   ]
    intro_end = '</div>'

    for single_dir in os.listdir(grobid_files_directory):
        input_path = os.path.join(grobid_files_directory, single_dir)
        # Skip sub-directories and macOS Finder metadata files.
        if not os.path.isfile(input_path) or single_dir == '.DS_Store':
            continue

        article_lines = utils.read_lines_from_file(input_path)
        article_text = get_article_text(article_lines, paragraph_begin, paragraph_end)
        figure_captions = get_article_text(article_lines, fig_begin, fig_end)
        abstract = get_article_abstract(article_lines)
        title = get_article_text(article_lines, title_begin, title_end)
        introduction = get_article_text(article_lines, intro_begin, intro_end)

        with open(os.path.join(temp_output_dir, single_dir), 'w+') as output_file:
            # Body paragraphs first; empty lines are dropped.
            for line in article_text:
                if len(line) != 0:
                    output_file.write(line + '\n')

            output_file.write('<figcaptions>\n')
            for caption in figure_captions:
                output_file.write(caption + '\n')

            # Title/introduction may be absent; only the first match is kept.
            output_file.write('<title>\n')
            if len(title) > 0:
                output_file.write(title[0] + '\n')

            output_file.write('<abstract>\n')
            output_file.write(abstract + '\n')

            output_file.write('<introduction>\n')
            if len(introduction) > 0:
                output_file.write(introduction[0] + '\n')
    return temp_output_dir