예제 #1
0
    def _get_preprocessed_df(
        self, preprocessed_filename: str, df_articles: DataFrame, document_type: DocumentType, overwrite: bool
    ) -> DataFrame:
        """
        Return the preprocessed dataframe, either from a cached JSON file or by preprocessing ``df_articles``.

        If ``src/output/<preprocessed_filename>.json`` exists and ``overwrite`` is false, that file is read with
        ``Reader.read_json_to_df_default`` and returned without running any preprocessing. Otherwise the
        preprocessing depends on the document type:

        * ``DocumentType.ARTICLE``: ``_apply_preprocessing`` with ``FilterType.PARTIES``.
        * ``DocumentType.PARAGRAPH``: ``_preprocess_paragraphs``.
        * any other type: the ``title`` and ``media`` columns are selected, ``title`` is renamed to ``text`` and
          the result is passed to ``_apply_preprocessing`` with ``FilterType.NONE``.

        NOTE(review): this method does not write the JSON file itself; presumably one of the helpers or the
        caller persists the result -- confirm.

        :param preprocessed_filename: Name (without directory and ``.json`` suffix) of the cached JSON file.
        :param df_articles: Dataframe with the text to preprocess; only used when no cached file is returned.
        :param document_type: Type of the document that is going to be preprocessed.
        :param overwrite: If true, an existing JSON file is ignored and the data is preprocessed again.
        :return: Pandas dataframe of the preprocessed input.
        """
        cache_file = f"src/output/{preprocessed_filename}.json"
        cache_path = Path(cache_file)

        # An existing cache file is only reused when the caller did not ask for a refresh.
        if cache_path.exists() and not overwrite:
            return Reader.read_json_to_df_default(cache_file)

        # Dispatch on the enum value, compared against the values of the DocumentType members.
        type_value = document_type.value

        if type_value == DocumentType.ARTICLE.value:
            return self._apply_preprocessing(df_articles, document_type, FilterType.PARTIES)

        if type_value == DocumentType.PARAGRAPH.value:
            return self._preprocess_paragraphs(df_articles)

        # Remaining document types: the title column is handed on under the name "text".
        titles_as_text = df_articles[["title", "media"]].rename(columns={"title": "text"})
        return self._apply_preprocessing(titles_as_text, document_type, FilterType.NONE)
예제 #2
0
 def __init__(self, path: str):
     """
     Load the JSON file at ``path`` and set up the attributes derived from it.

     The file is read with ``Reader.read_json_to_df_default``; the resulting dataframe is stored as
     ``self.dataframe`` and also passed to ``TfidfSentiment``, whose instance is stored as
     ``self.tfidf_sentiment``.

     :param path: Path of the JSON file handed to ``Reader.read_json_to_df_default``.
     """
     loaded_frame = Reader.read_json_to_df_default(path)
     self.dataframe = loaded_frame
     self.tfidf_sentiment = TfidfSentiment(loaded_frame)