Пример #1
0
    def fuzzy_find(self, keywords, threshold=50, max_return=10, exclude=None,
                   by_index=True):
        """Find the records that best match a string of keywords.

        For every record a search string is assembled from its authors,
        title and keywords (authors and keywords are skipped when the
        dataset does not provide them). These strings are scored against
        `keywords` by `get_fuzzy_scores`, and the highest scoring records
        are returned.

        Arguments
        ---------
        keywords: str
            A string of keywords together, can be a combination.
        threshold: float
            Don't return records scoring below this threshold. The best
            non-excluded match is always returned, even when it scores
            below the threshold.
        max_return: int
            Maximum number of records to return.
        exclude: list, np.ndarray
            Indices that should be excluded from the search, for example
            papers that were already labeled. They are interpreted as
            internal indices if `by_index` is True and as record ids
            otherwise.
        by_index: bool
            If True, use internal indexing.
            If False, use record ids for indexing.

        Returns
        -------
        list:
            Indices (or record ids) of the best matches, best match first.
        """
        n_records = len(self)
        match_str = np.full(n_records, "x", dtype=object)

        all_titles = self.title
        all_authors = self.authors
        all_keywords = self.keywords
        for i in range(n_records):
            match_list = []
            if all_authors is not None:
                match_list.append(format_to_str(all_authors[i]))
            match_list.append(all_titles[i])
            if all_keywords is not None:
                match_list.append(format_to_str(all_keywords[i]))
            match_str[i] = " ".join(match_list)

        new_ranking = get_fuzzy_scores(keywords, match_str)
        # Negate the scores so that argsort yields the best match first.
        sorted_idx = np.argsort(-new_ranking)

        if exclude is None:
            exclude = np.array([], dtype=int)
        # Record ids are only needed (and only looked up) when the caller
        # works with record ids instead of internal indices.
        record_ids = None if by_index else self.df.index.values

        best_idx = []
        for idx in sorted_idx:
            key = idx if by_index else record_ids[idx]
            if key in exclude:
                continue
            if len(best_idx) >= max_return:
                break
            # Scores are visited in descending order, so once one falls
            # below the threshold all remaining ones do too. The first hit
            # is exempt so that the best match is always reported.
            if len(best_idx) > 0 and new_ranking[idx] < threshold:
                break
            best_idx.append(idx)

        # `np.int` was removed in NumPy 1.24; the builtin `int` is the type
        # it used to alias.
        fuzz_idx = np.array(best_idx, dtype=int)
        if not by_index:
            fuzz_idx = record_ids[fuzz_idx]
        return fuzz_idx.tolist()
Пример #2
0
    def match_string(self):
        """Build one search string per record.

        Each string is the space-separated concatenation of the record's
        authors, title and keywords, in that order. Authors and keywords
        are left out when the corresponding attribute is None.

        Returns
        -------
        np.ndarray:
            Object array of length `len(self)` holding the search strings.
        """
        n_records = len(self)
        combined = np.full(n_records, "x", dtype=object)

        titles = self.title
        authors = self.authors
        keywords = self.keywords

        def _pieces(row):
            # Yield the available text fields of one record, in output order.
            if authors is not None:
                yield format_to_str(authors[row])
            yield titles[row]
            if keywords is not None:
                yield format_to_str(keywords[row])

        for row in range(n_records):
            combined[row] = " ".join(_pieces(row))
        return combined
Пример #3
0
    def format(self, use_cli_colors=True):
        """Render this record as a block of text for the CLI.

        The block consists of the title, the authors and the abstract
        (each only when present and, for authors/abstract, non-empty),
        framed by two dashed separator lines.

        Arguments
        ---------
        use_cli_colors: bool
            If True, wrap the title in ANSI escape codes so terminals
            that support colors highlight it.

        Returns
        -------
        str:
            A string including title, abstracts and authors.
        """
        rule = "----------------------------------"

        title = self.title
        if title is None:
            title = ""
        else:
            if use_cli_colors:
                title = "\033[95m" + title + "\033[0m"
            title = title + "\n"

        has_authors = self.authors is not None and len(self.authors) > 0
        authors = format_to_str(self.authors) + "\n" if has_authors else ""

        has_abstract = self.abstract is not None and len(self.abstract) > 0
        abstract = "\n" + self.abstract + "\n" if has_abstract else ""

        return f"\n\n{rule}\n{title}{authors}{abstract}{rule}\n\n"
Пример #4
0
def _ellipsize(text, width):
    """Shorten `text` to at most `width` characters, ending in ".." if cut."""
    if len(text) <= width:
        return text
    # The outer slice only matters for width < 2, where the ".." marker on
    # its own would already be wider than the column.
    return (text[:max(width - 2, 0)] + "..")[:max(width, 0)]


def preview_record(record, w_title=80, w_authors=40, automatic_width=False):
    """Return a single line preview string for a record.

    The line consists of a title column and an authors column, separated
    by three spaces. Both columns are left-aligned, padded with spaces to
    their width, and truncated with a trailing ".." when the text does not
    fit. If the record has no title, its abstract is shown in the title
    column instead.

    Arguments
    ---------
    record: PaperRecord
        The paperRecord to preview.
    w_title: int
        Width to be allocated for the title of the paper.
    w_authors: int
        Width to be allocated for the authors of the paper.
    automatic_width: bool
        If true, compute w_title, w_authors from the console width
        (overriding the values passed in).

    Returns
    -------
    str:
        A string that previews a paper record.
    """
    if automatic_width:
        # shutil.get_terminal_size falls back to a default size when the
        # output is not attached to a terminal (e.g. piped to a file),
        # whereas os.get_terminal_size raises OSError in that case.
        import shutil

        term_width = shutil.get_terminal_size().columns
        # Keep 7 columns free; the column separator takes 3 of them.
        width_available = term_width - 7
        w_title = round((2 / 3) * width_available)
        w_authors = width_available - w_title

    heading = record.title
    if heading is None:
        heading = record.abstract
    title_str = "" if heading is None else _ellipsize(heading, w_title)

    author_str = ""
    if record.authors is not None:
        author_str = _ellipsize(format_to_str(record.authors), w_authors)

    return f"{title_str: <{w_title}}   {author_str: <{w_authors}}"