Example #1
    def zip_workspace(self) -> str:
        """Sends a zip file containing a pickle file of session & its folder.

        :return: the path of the zipped workspace
        """
        # TODO: move this to matrix model
        # initialize the save path
        save_path = os.path.join(
            constants.UPLOAD_FOLDER,
            constants.WORKSPACE_DIR)
        rounded_next_id = str(self.next_id % 10000)  # take the last 4 digits
        workspace_file_path = os.path.join(
            constants.UPLOAD_FOLDER,
            rounded_next_id + '_' + constants.WORKSPACE_FILENAME)

        # remove unnecessary content in the workspace
        try:
            shutil.rmtree(
                os.path.join(
                    session_manager.session_folder(),
                    constants.RESULTS_FOLDER))
            # attempt to remove the results folder (CSV matrices, etc.)
        except FileNotFoundError:
            pass

        # move the session folder to the workspace folder
        try:
            # try to remove previous workspace in order to resolve conflict
            os.remove(workspace_file_path)
        except FileNotFoundError:
            pass
        try:
            # empty the save path in order to resolve conflict
            shutil.rmtree(save_path)
        except FileNotFoundError:
            pass
        general_functions.copy_dir(session_manager.session_folder(), save_path)

        # save the session in the workspace folder
        session_manager.save(save_path)

        # zip the dir
        with zipfile.ZipFile(workspace_file_path, 'w') as zip_file:
            general_functions.zip_dir(save_path, zip_file)
        # remove the original dir
        shutil.rmtree(save_path)

        return workspace_file_path
Example #2
    def get_download_path(self) -> str:
        """Write the generated top word results to an output CSV file.

        :return: path of the generated CSV file.
        """
        # Get topword result.
        topword_result = self._get_result()

        # Get the default saving directory of topword result.
        result_folder_path = os.path.join(session_manager.session_folder(),
                                          RESULTS_FOLDER)

        # Attempt to make the directory.
        if not os.path.isdir(result_folder_path):
            os.makedirs(result_folder_path)

        # Get the complete saving path of topword result.
        save_path = os.path.join(result_folder_path, TOPWORD_CSV_FILE_NAME)

        # Write to the file.
        with open(save_path, 'w', encoding='utf-8') as file:
            # Write header to the file.
            file.write(topword_result.header + '\n')
            # Write results to the file.
            # Since we want indexes and data in rows, we get the transpose.
            for result in topword_result.results:
                file.write(
                    pd.DataFrame(result).transpose().to_csv(header=True))

        return save_path
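
A note on the loop above: DataFrame.transpose() is what puts each document's values on a single CSV row. A minimal sketch with toy data illustrates the effect; the element type of topword_result.results is not shown in this example, so a simple pandas Series is assumed here.

import pandas as pd

# Toy stand-in for one entry of topword_result.results (assumed shape).
result = pd.Series({'the': 0.12, 'and': 0.08}, name='doc1')

# Without the transpose, each word becomes its own CSV row:
print(pd.DataFrame(result).to_csv(header=True))
# ,doc1
# the,0.12
# and,0.08

# With the transpose, as in the loop above, the document becomes one row:
print(pd.DataFrame(result).transpose().to_csv(header=True))
# ,the,and
# doc1,0.12,0.08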
Example #3
    def get_download_path(self) -> str:
        """Write the generated top word results to an output CSV file.

        :return: path of the generated CSV file.
        """
        # Get topword result.
        topword_result = self._get_result()

        # Get the default saving directory of topword result.
        result_folder_path = os.path.join(
            session_manager.session_folder(), RESULTS_FOLDER)

        # Attempt to make the directory.
        if not os.path.isdir(result_folder_path):
            os.makedirs(result_folder_path)

        # Get the complete saving path of topword result.
        save_path = os.path.join(result_folder_path, TOPWORD_CSV_FILE_NAME)

        # Write to the file.
        with open(save_path, 'w', encoding='utf-8') as file:
            # Write header to the file.
            file.write(topword_result.header + '\n')
            # Write results to the file.
            # Since we want indexes and data in rows, we get the transpose.
            for result in topword_result.results:
                file.write(
                    pd.DataFrame(result).transpose().to_csv(header=True))

        return save_path
Example #4
def generate_rw_matrix(data_list):
    """
    Generates the raw data matrix for the rolling windows graph.

    Args:
        data_list: a list of [x, y] points

    Returns:
        Output file path and extension.
    """

    extension = '.csv'
    delimiter = ','

    folder_path = path_join(session_manager.session_folder(),
                            constants.RESULTS_FOLDER)
    if not os.path.isdir(folder_path):
        makedirs(folder_path)
    out_file_path = path_join(folder_path, 'RWresults' + extension)

    rows = ["" for _ in range(len(data_list[0]))]

    with open(out_file_path, 'w', encoding='utf-8') as out_file:
        for i in range(len(data_list)):

            for j in range(len(data_list[i])):
                rows[j] = rows[j] + str(data_list[i][j]) + delimiter

        for i in range(len(rows)):
            out_file.write(rows[i] + '\n')

    return out_file_path, extension
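
The row strings above are assembled by hand, which leaves a trailing delimiter on every line. As a hedged alternative sketch, the standard-library csv module can write the same matrix; like the loop above, it assumes data_list is a list of equally long columns, and the helper name write_rw_matrix_csv is made up for this illustration.

import csv

def write_rw_matrix_csv(data_list, out_file_path):
    # zip(*data_list) turns the list of columns into rows of values.
    with open(out_file_path, 'w', encoding='utf-8', newline='') as out_file:
        csv.writer(out_file).writerows(zip(*data_list))
    return out_file_path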
Example #5
    def zip_workspace(self) -> str:
        """Sends a zip file containing a pickle file of session & its folder.

        :return: the path of the zipped workspace
        """
        # TODO: move this to matrix model
        # initialize the save path
        save_path = os.path.join(constants.UPLOAD_FOLDER,
                                 constants.WORKSPACE_DIR)
        rounded_next_id = str(self.next_id % 10000)  # take the last 4 digits
        workspace_file_path = os.path.join(
            constants.UPLOAD_FOLDER,
            rounded_next_id + '_' + constants.WORKSPACE_FILENAME)

        # remove unnecessary content in the workspace
        try:
            shutil.rmtree(
                os.path.join(session_manager.session_folder(),
                             constants.RESULTS_FOLDER))
            # attempt to remove the results folder (CSV matrices, etc.)
        except FileNotFoundError:
            pass

        # move the session folder to the workspace folder
        try:
            # try to remove previous workspace in order to resolve conflict
            os.remove(workspace_file_path)
        except FileNotFoundError:
            pass
        try:
            # empty the save path in order to resolve conflict
            shutil.rmtree(save_path)
        except FileNotFoundError:
            pass
        general_functions.copy_dir(session_manager.session_folder(), save_path)

        # save the session in the workspace folder
        session_manager.save(save_path)

        # zip the dir
        with zipfile.ZipFile(workspace_file_path, 'w') as zip_file:
            general_functions.zip_dir(save_path, zip_file)
        # remove the original dir
        shutil.rmtree(save_path)

        return workspace_file_path
Example #6
    def update_workspace(self):
        """Updates the whole work space."""

        # update the savepath of each file
        for l_file in list(self.files.values()):
            l_file.save_path = pathjoin(session_manager.session_folder(),
                                        constants.FILE_CONTENTS_FOLDER,
                                        str(l_file.id) + '.txt')
        # update the session
        session_manager.load()
Example #7
    def update_workspace(self):
        """Updates the whole work space."""

        # update the savepath of each file
        for l_file in list(self.files.values()):
            l_file.savePath = pathjoin(
                session_manager.session_folder(),
                constants.FILE_CONTENTS_FOLDER,
                str(l_file.id) + '.txt')
        # update the session
        session_manager.load()
Example #8
def big_pca():
    """Reads the big image of the PCA and displays it on the web browser.

    :return: a response object with the big PCA as a png to flask and
    eventually to the browser.
    """
    if constants.PCA_BIG_GRAPH_FILENAME:
        folder = path_join(session_manager.session_folder(),
                           constants.RESULTS_FOLDER)
        plotly_url = os.path.join(folder, constants.PCA_BIG_GRAPH_FILENAME)
        return send_file(plotly_url)
Example #9
    def handle_upload_workspace(self):
        """Handles the session when you upload a workspace (.lexos) file."""

        # save .lexos file
        save_path = os.path.join(constants.UPLOAD_FOLDER,
                                 constants.WORKSPACE_DIR)
        save_file = os.path.join(save_path, str(self.next_id) + '.zip')
        try:
            os.makedirs(save_path)
        except FileExistsError:
            pass
        with open(save_file, 'wb') as f:
            f.write(request.data)

        # clean the session folder
        shutil.rmtree(session_manager.session_folder())

        # extract the zip
        upload_session_path = os.path.join(
            constants.UPLOAD_FOLDER,
            str(self.next_id) + '_upload_work_space_folder')
        with zipfile.ZipFile(save_file) as zf:
            zf.extractall(upload_session_path)
        general_functions.copy_dir(upload_session_path,
                                   session_manager.session_folder())

        # remove temp
        shutil.rmtree(save_path)
        shutil.rmtree(upload_session_path)

        try:
            # if there is no file contents folder, make one.
            # this dir will be lost during download (zip) if the original
            # file contents folder does not contain anything.
            os.makedirs(
                os.path.join(session_manager.session_folder(),
                             constants.FILE_CONTENTS_FOLDER))
        except FileExistsError:
            pass
Example #10
    def handle_upload_workspace(self):
        """Handles the session when you upload a workspace (.lexos) file."""

        # save .lexos file
        save_path = os.path.join(constants.UPLOAD_FOLDER,
                                 constants.WORKSPACE_DIR)
        save_file = os.path.join(save_path, str(self.next_id) + '.zip')
        try:
            os.makedirs(save_path)
        except FileExistsError:
            pass
        with open(save_file, 'wb') as f:
            f.write(request.data)

        # clean the session folder
        shutil.rmtree(session_manager.session_folder())

        # extract the zip
        upload_session_path = os.path.join(
            constants.UPLOAD_FOLDER, str(
                self.next_id) + '_upload_work_space_folder')
        with zipfile.ZipFile(save_file) as zf:
            zf.extractall(upload_session_path)
        general_functions.copy_dir(upload_session_path,
                                   session_manager.session_folder())

        # remove temp
        shutil.rmtree(save_path)
        shutil.rmtree(upload_session_path)

        try:
            # if there is no file contents folder, make one.
            # this dir will be lost during download (zip) if the original
            # file contents folder does not contain anything.
            os.makedirs(os.path.join(session_manager.session_folder(),
                                     constants.FILE_CONTENTS_FOLDER))
        except FileExistsError:
            pass
Example #11
def k_means_image():
    """Reads the png image of the kmeans and displays it on the web browser.

    *kmeansimage() linked to in analysis.html, displaying the kmeansimage.png
    :return: a response object with the kmeansimage png to flask and
     eventually to the browser.
    """
    # kmeansimage() is called in kmeans.html, displaying the
    # KMEANS_GRAPH_FILENAME (if session['kmeansdatagenerated'] != False).
    image_path = path_join(session_manager.session_folder(),
                           constants.RESULTS_FOLDER,
                           constants.KMEANS_GRAPH_FILENAME)
    return send_file(image_path)
Example #12
def save_file_manager(file_manager: FileManager):
    """
    Saves the file manager to the hard drive.

    Args:
        file_manager: File manager object to be saved.

    Returns:
        None
    """

    file_manager_path = os.path.join(session_folder(),
                                     constants.FILEMANAGER_FILENAME)
    with open(file_manager_path, 'wb') as file_manager_file:
        pickle.dump(file_manager, file_manager_file)
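
A minimal usage sketch pairing this helper with load_file_manager from Example #17 below; it assumes the Lexos application context is in place so that session_folder() resolves, and uses FileManager as constructed in Examples #13 and #16.

# Hypothetical round trip, not part of the Lexos source:
file_manager = FileManager()      # creates the session's file contents folder
save_file_manager(file_manager)   # pickles it under the session folder
restored = load_file_manager()    # reads the same pickle back
assert restored.next_id == file_manager.next_id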
Example #13
    def __init__(self):
        """Class for object to hold info about user's files & choices in Lexos.

        Each user will have their own unique instance of the
        FileManager. A major data attribute of this class is a dictionary
        holding the LexosFile objects, each representing an uploaded file to be
        used in Lexos. The key for the dictionary is the unique ID of the file,
        with the value being the corresponding LexosFile object.
        """

        self._files = {}
        self.next_id = 0

        makedirs(pathjoin(session_manager.session_folder(),
                          constants.FILE_CONTENTS_FOLDER))
Example #14
    def scrub_contents(self, saving_changes: bool) -> str:
        """ Scrubs the contents of the file according to the user's options

        May save the changes or not.
        :param saving_changes: boolean saying whether or not to save the
                               changes made.
        :return: a preview string of the possibly changed file.
        """

        storage_options = []
        for key in list(request.form.keys()):
            if 'usecache' in key:
                storage_options.append(key[len('usecache'):])

        if 'scrub' not in self.options:
            self.options['scrub'] = {}
        scrub_options = self.get_scrub_options()

        text_string = self.load_contents()

        text_string = scrubber.scrub(
            text_string,
            gutenberg=self.is_gutenberg,
            lower=scrub_options['lowercasebox'],
            punct=scrub_options['punctuationbox'],
            apos=scrub_options['aposbox'],
            hyphen=scrub_options['hyphensbox'],
            amper=scrub_options['ampersandbox'],
            digits=scrub_options['digitsbox'],
            tags=scrub_options['tagbox'],
            white_space=scrub_options['whitespacebox'],
            spaces=scrub_options['spacesbox'],
            tabs=scrub_options['tabsbox'],
            new_lines=scrub_options['newlinesbox'],
            opt_uploads=request.files,
            storage_options=storage_options,
            storage_folder=session_manager.session_folder() + '/scrub/',
            previewing=not saving_changes)

        if saving_changes:
            self.save_contents(text_string)
            self.save_scrub_options()

        # renew the preview
        self.contents_preview = self.generate_preview(text_string)
        text_string = self.contents_preview

        return text_string
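
For reference, a small illustration of how the storage_options list above is built from the submitted form keys; the key names here are invented for the example and are not taken from the Lexos front end.

# Keys containing 'usecache' keep everything after that prefix,
# mirroring key[len('usecache'):] in the loop above.
form_keys = ['usecacheconsfileselect[]', 'usecachestopwords', 'lowercasebox']
storage_options = [key[len('usecache'):] for key in form_keys
                   if 'usecache' in key]
# storage_options == ['consfileselect[]', 'stopwords']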
Example #15
    def scrub_contents(self, saving_changes: bool) -> str:
        """ Scrubs the contents of the file according to the user's options

        May save the changes or not.
        :param saving_changes: boolean saying whether or not to save the
                               changes made.
        :return: a preview string of the possibly changed file.
        """

        storage_options = []
        for key in list(request.form.keys()):
            if 'usecache' in key:
                storage_options.append(key[len('usecache'):])

        if 'scrub' not in self.options:
            self.options['scrub'] = {}
        scrub_options = self.get_scrub_options()

        text_string = self.load_contents()

        text_string = scrubber.scrub(
            text_string,
            gutenberg=self.is_gutenberg,
            lower=scrub_options['lowercasebox'],
            punct=scrub_options['punctuationbox'],
            apos=scrub_options['aposbox'],
            hyphen=scrub_options['hyphensbox'],
            amper=scrub_options['ampersandbox'],
            digits=scrub_options['digitsbox'],
            tags=scrub_options['tagbox'],
            white_space=scrub_options['whitespacebox'],
            spaces=scrub_options['spacesbox'],
            tabs=scrub_options['tabsbox'],
            new_lines=scrub_options['newlinesbox'],
            opt_uploads=request.files,
            storage_options=storage_options,
            storage_folder=session_manager.session_folder() + '/scrub/',
            previewing=not saving_changes)

        if saving_changes:
            self.save_contents(text_string)
            self.save_scrub_options()

        # renew the preview
        self.contents_preview = self.generate_preview(text_string)
        text_string = self.contents_preview

        return text_string
Example #16
    def __init__(self):
        """Class for object to hold info about user's files & choices in Lexos.

        Each user will have their own unique instance of the
        FileManager. A major data attribute of this class is a dictionary
        holding the LexosFile objects, each representing an uploaded file to be
        used in Lexos. The key for the dictionary is the unique ID of the file,
        with the value being the corresponding LexosFile object.
        """

        self._files = {}
        self.next_id = 0

        makedirs(
            pathjoin(session_manager.session_folder(),
                     constants.FILE_CONTENTS_FOLDER))
Example #17
def load_file_manager() -> FileManager:
    """
    Loads the file manager for the specific session from the hard drive.

    Args:
        None

    Returns:
        The file manager object for the session.
    """

    file_manager_path = os.path.join(session_folder(),
                                     constants.FILEMANAGER_FILENAME)

    with open(file_manager_path, 'rb') as file_manager_file:
        file_manager = pickle.load(file_manager_file)

    return file_manager
Example #18
def generate_rw_matrix_plot(data_points: List[List[List[int]]],
                            legend_labels_list: List[str]) -> Tuple[str, str]:
    """
    Generates the raw data matrix for the rolling windows graph.

    Args:
        data_points: a list of series, each a list of [x, y] points
        legend_labels_list: a list of legend labels; the first element is a
            '#'-separated string of labels used for the header row

    Returns:
        Output file path and extension.
    """

    extension = '.csv'
    delimiter = ','

    folder_path = path_join(session_manager.session_folder(),
                            constants.RESULTS_FOLDER)
    if not os.path.isdir(folder_path):
        makedirs(folder_path)
    out_file_path = path_join(folder_path, 'RWresults' + extension)

    max_len = 0
    for i in range(len(data_points)):
        if len(data_points[i]) > max_len:
            max_len = len(data_points[i])
    max_len += 1

    rows = [""] * max_len

    legend_labels_list[0] = legend_labels_list[0].split('#')

    rows[0] = (delimiter + delimiter).join(
        legend_labels_list[0]) + delimiter + delimiter

    with open(out_file_path, 'w', encoding='utf-8') as out_file:
        for i in range(len(data_points)):
            for j in range(1, len(data_points[i]) + 1):
                rows[j] = rows[j] + str(
                    data_points[i][j - 1][0]) + delimiter + str(
                        data_points[i][j - 1][1]) + delimiter

        for i in range(len(rows)):
            out_file.write(rows[i] + '\n')

    return out_file_path, extension
Example #19
    def download_rwa(self) -> str:
        """Download rolling window analysis result as CSV file.

        :return: The path of the saved CSV file.
        """
        # Get the default saving directory of rolling window result.
        result_folder_path = os.path.join(
            session_manager.session_folder(), RESULTS_FOLDER)

        # Attempt to make the directory.
        if not os.path.isdir(result_folder_path):
            os.makedirs(result_folder_path)

        # Get the complete saving path of rolling window result.
        save_path = os.path.join(result_folder_path, "rolling_window.csv")

        self._get_rwa_csv_frame().to_csv(path_or_buf=save_path,
                                         index_label="# Window",
                                         na_rep="NA")

        return save_path
Example #20
    def download_rwa(self) -> str:
        """Download rolling window analysis result as CSV file.

        :return: The path of the saved CSV file.
        """
        # Get the default saving directory of rolling window result.
        result_folder_path = os.path.join(session_manager.session_folder(),
                                          RESULTS_FOLDER)

        # Attempt to make the directory.
        if not os.path.isdir(result_folder_path):
            os.makedirs(result_folder_path)

        # Get the complete saving path of rolling window result.
        save_path = os.path.join(result_folder_path, "rolling_window.csv")

        self._get_rwa_csv_frame().to_csv(path_or_buf=save_path,
                                         index_label="# Window",
                                         na_rep="NA")

        return save_path
Example #21
    def __init__(self, original_filename: str,
                 file_name: str, file_string: str, file_id: int):
        """Class for an object to hold all info about a specific uploaded file.

        Each uploaded file will be stored in a unique object, and accessed
        through the FileManager files dictionary. A major data attribute of
        this class is a string that, most of the time, contains the text
        contents of the file.
        This newly constructed LexosFile object is created from the information
        passed in, and performs some preliminary processing.
        :param original_filename: the original file name of the uploaded file.
        :param file_name: the file name we store.
        :param file_string: contents of the file's text.
        :param file_id: the ID to assign to the new file.
        """

        self.doc_type = 'text'  # default doc type
        self.id = file_id
        self.original_source_filename = original_filename
        self.name = file_name
        self.contents_preview = self.generate_preview(file_string)
        self.save_path = pathjoin(
            session_manager.session_folder(),
            constants.FILE_CONTENTS_FOLDER, str(self.id) + '.txt')
        self.save_contents(file_string)

        self.active = True
        self.class_label = ''

        split_name = self.name.split('.')

        self.label = '.'.join(split_name[:-1])

        self.set_type_from(split_name[-1], file_string)

        self.has_tags = self.check_for_tags(file_string)

        self.is_gutenberg = self.check_for_gutenberg(file_string)

        self.options = {}
Example #22
    def __init__(self, original_filename: str,
                 file_name: str, file_string: str, file_id: int):
        """Class for an object to hold all info about a specific uploaded file.

        Each uploaded file will be stored in a unique object, and accessed
        through the FileManager files dictionary. A major data attribute of
        this class is a string that, most of the time, contains the text
        contents of the file.
        This newly constructed LexosFile object is created from the information
        passed in, and performs some preliminary processing.
        :param original_filename: the original file name of the uploaded file.
        :param file_name: the file name we store.
        :param file_string: contents of the file's text.
        :param file_id: the ID to assign to the new file.
        """

        self.doc_type = 'text'  # default doc type
        self.id = file_id
        self.original_source_filename = original_filename
        self.name = file_name
        self.contents_preview = self.generate_preview(file_string)
        self.save_path = pathjoin(
            session_manager.session_folder(),
            constants.FILE_CONTENTS_FOLDER, str(self.id) + '.txt')
        self.save_contents(file_string)

        self.active = True
        self.class_label = ''

        split_name = self.name.split('.')

        self.label = '.'.join(split_name[:-1])

        self.set_type_from(split_name[-1], file_string)

        self.has_tags = self.check_for_tags(file_string)

        self.is_gutenberg = self.check_for_gutenberg(file_string)

        self.options = {}
Example #23
    def download_dtm(self) -> str:
        """Download the desired DTM as a CSV file.

        :return: The file path where the CSV file is saved.
        """
        # Select the proper DTM based on the user's choice.
        required_dtm = self._get_file_col_dtm() \
            if self._front_end_option.orientation == "file_col" \
            else self._get_file_row_dtm()

        # Get the default folder path, if it does not exist, create one.
        folder_path = os.path.join(session_folder(), RESULTS_FOLDER)
        if not os.path.isdir(folder_path):
            os.makedirs(folder_path)

        # Set the default file path.
        file_path = os.path.join(folder_path, "tokenizer_result.csv")

        # Save it to the file path.
        required_dtm.to_csv(file_path)

        # Return where the file is.
        return file_path
Example #24
    def get_topword_csv_path(self, class_division_map: pd.DataFrame) -> str:
        """Write the generated top word results to an output CSV file.

        :param class_division_map: a pandas data frame where:
            - the data is the division map with boolean values that indicate
              which class each file belongs to.
            - the index is the class labels.
            - the column is the file id.
        :return: path of the generated CSV file.
        """
        # Make the path.
        result_folder_path = os.path.join(session_manager.session_folder(),
                                          RESULTS_FOLDER)

        # Attempt to make the save path directory.
        try:
            os.makedirs(result_folder_path)
        except OSError:
            pass

        # Get the path to save file.
        save_path = os.path.join(result_folder_path, TOPWORD_CSV_FILE_NAME)

        # Get topword result.
        topword_result = \
            self._get_result(class_division_map=class_division_map)

        with open(save_path, 'w', encoding='utf-8') as f:
            # Write header to the file.
            f.write(topword_result.header + '\n')
            # Write results to the file.
            # Since we want indexes and data in rows, we get the transpose.
            for result in topword_result.results:
                f.write(pd.DataFrame(result).transpose().to_csv(header=True))

        return save_path
Example #25
def generate_mc_json_obj(file_manager: FileManager):
    """
    Generates a JSON object for multicloud when working with a MALLET .txt
    file.

    Args:
        file_manager: the FileManager object for the current session.

    Returns:
        An object, formatted in the JSON that d3 needs, either a list or a
        dictionary.
    """

    content_path = os.path.join(session_manager.session_folder(),
                                constants.FILE_CONTENTS_FOLDER,
                                constants.MALLET_INPUT_FILE_NAME)
    output_path = os.path.join(session_manager.session_folder(),
                               constants.RESULTS_FOLDER,
                               constants.MALLET_OUTPUT_FILE_NAME)
    try:
        makedirs(
            path_join(session_manager.session_folder(),
                      constants.RESULTS_FOLDER))
        # attempt to make the result dir
    except FileExistsError:
        pass  # result dir already exists

    if request.form['analysistype'] == 'userfiles':

        json_obj = generate_json_for_d3(file_manager, merged_set=False)

    else:  # request.form['analysistype'] == 'topicfile'

        topic_string = str(request.files['optuploadname'])
        topic_string = re.search(r"'(.*?)'", topic_string)
        topic_string = topic_string.group(1)

        if topic_string != '':
            request.files['optuploadname'].save(content_path)

        with open(content_path, 'r', encoding='utf-8') as f:
            content = f.read()  # reads content from the upload file
            # Coerce non-UTF-8 files to UTF-8
            encoding = general_functions.get_encoding(content)
            if encoding != 'utf-8':
                content = content.decode(encoding).encode('utf-8')

        if content.startswith('#doc source pos typeindex type topic'):
            # begin converting a Mallet file into the file d3 can understand
            tuples = []
            # Read the output_state file
            with open(content_path, encoding='utf-8') as f:
                # Skip the first three lines
                for _ in range(3):
                    next(f)
                # Create a list of type:topic combinations
                for line in f:
                    # Make sure the number of columns is correct
                    line = re.sub(r'\s+', ' ', line)
                    try:
                        doc, source, pos, type_index, doc_type, topic = \
                            line.rstrip().split(' ')
                        type_topic_combination = doc_type + ':' + topic
                        tuples.append(type_topic_combination)
                    except BaseException:
                        raise Exception(
                            "Your source data cannot be parsed into a regular "
                            "number of columns. Please ensure that there are "
                            "no spaces in your file names or file paths. It; "
                            "may be easiest to open the outpt_state file in a "
                            "spreadsheet using a space as; the delimiter and "
                            "text as the field type. Data should only be "
                            "present in columns; A to F. Please fix any "
                            "misaligned data and run this script again.")

            # Count the number of times each type-topic combo appears
            from collections import defaultdict

            topic_count = defaultdict(int)
            for x in tuples:
                topic_count[x] += 1

            # Populate a topic_counts dict with type: topic:count
            words = []
            topic_counts = {}
            for k, v in topic_count.items():
                doc_type, topic = k.split(':')
                count = int(v)
                tc = topic + ":" + str(count)
                if doc_type in words:
                    topic_counts[doc_type] = topic_counts[doc_type] + " " + tc
                else:
                    topic_counts[doc_type] = tc
                words.append(doc_type)

            # Add a word ID
            out = ""
            i = 0
            for k, v in topic_counts.items():
                out += str(i) + " " + k + " " + v + "\n"
                i += 1

            # Write the output file
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(out)  # Python will convert \n to os.linesep
                # end converting a Mallet file into the file d3 can understand
        else:
            with open(output_path, 'w', encoding='utf-8') as f:
                # if this is the json form,
                # just write that in the output folder
                f.write(content)

        json_obj = multicloud_topic.topic_json_maker(output_path)

    return json_obj
Example #26
    def _file_manager_path(self) -> str:
        """Get the path of the file manager pickle file."""
        return os.path.join(session_folder(), constants.FILEMANAGER_FILENAME)
Example #27
def generate_k_means_voronoi(file_manager: FileManager):
    """
    Generates a table of cluster_number and file name from the active files.

    Args:
        file_manager: the FileManager object for the current session.

    Returns:
        kmeans_index: a list of index of the closest center of the file
        siltt_score: a float of silhouette score based on KMeans algorithm
        file_name_str: a string of file names, separated by '#'
        k_value: an int of the number of K from input
    """

    ngram_size, use_word_tokens, use_freq, use_tfidf, norm_option, grey_word, \
        show_grey_word, only_char_grams_within_words, mfw, culling = \
        file_manager.get_matrix_options_deprec()

    count_matrix = file_manager.get_matrix_deprec(
        use_word_tokens=use_word_tokens,
        use_tfidf=False,
        norm_option=norm_option,
        only_char_grams_within_words=only_char_grams_within_words,
        n_gram_size=ngram_size,
        use_freq=False,
        grey_word=grey_word,
        show_grey_word=show_grey_word,
        mfw=mfw,
        cull=culling)

    del count_matrix[0]
    for row in count_matrix:
        del row[0]

    matrix = np.array(count_matrix)

    # Gets options from request.form and uses options to generate the K-mean
    # results
    k_value = len(file_manager.get_active_files()) // 2  # default K value
    max_iter = 300  # default number of iterations
    init_method = request.form['init']
    n_init = 300
    tolerance = 1e-4

    if (request.form['nclusters'] != '') and (int(request.form['nclusters']) !=
                                              k_value):
        k_value = int(request.form['nclusters'])
    if (request.form['max_iter'] != '') and (int(request.form['max_iter']) !=
                                             max_iter):
        max_iter = int(request.form['max_iter'])
    if request.form['n_init'] != '':
        n_init = int(request.form['n_init'])
    if request.form['tolerance'] != '':
        tolerance = float(request.form['tolerance'])

    metric_dist = request.form['KMeans_metric']

    file_name_list = []
    for l_file in list(file_manager.files.values()):
        if l_file.active:
            if request.form["file_" + str(l_file.id)] == l_file.label:
                file_name_list.append(l_file.label)
            else:
                new_label = request.form["file_" + str(l_file.id)]
                file_name_list.append(new_label)
    file_name_str = file_name_list[0]

    for i in range(1, len(file_name_list)):
        file_name_str += "#" + file_name_list[i]

    folder_path = path_join(session_manager.session_folder(),
                            constants.RESULTS_FOLDER)
    if not os.path.isdir(folder_path):
        makedirs(folder_path)

    kmeans_index, siltt_score, color_chart, final_points_list, \
        final_centroids_list, text_data, max_x = KMeans.get_k_means_voronoi(
            matrix, k_value, max_iter, init_method, n_init, tolerance,
            metric_dist, file_name_list)

    return kmeans_index, siltt_score, file_name_str, k_value, color_chart, \
        final_points_list, final_centroids_list, text_data, max_x
Example #28
def generate_csv(file_manager: FileManager) -> Tuple[str, str]:
    """
    Generates a CSV file from the active files.

    Args:
        file_manager: the FileManager object for the current session.

    Returns:
        The filepath where the CSV was saved, and the chosen extension
        (.csv or .tsv) for the file.
    """
    transpose = request.form['csvorientation'] == 'filerow'
    use_tsv = request.form['csvdelimiter'] == 'tab'
    extension = '.tsv' if use_tsv else '.csv'

    count_matrix = generate_csv_matrix(file_manager)

    delimiter = '\t' if use_tsv else ','

    # add quotes to escape the tabs and commas in csv and tsv
    # escape all the file names (identical in both orientations)
    count_matrix[0] = [
        '"' + file_name + '"' for file_name in count_matrix[0]
    ]
    count_matrix = list(zip(*count_matrix))  # transpose the matrix
    # escape all the commas and tabs in the words, and make the leading item
    # an empty string.
    count_matrix[0] = [''] + ['"' + word + '"' for word in count_matrix[0][1:]]
    count_matrix = list(zip(*count_matrix))  # transpose the matrix back

    folder_path = path_join(session_manager.session_folder(),
                            constants.RESULTS_FOLDER)
    if not os.path.isdir(folder_path):
        makedirs(folder_path)
    out_file_path = path_join(folder_path, 'results' + extension)

    # Write results to output file, and write class labels depending on
    # transpose
    class_label_list = ["Class Label"]
    for l_file in list(file_manager.files.values()):
        if l_file.active:
            class_label_list.append(l_file.class_label)

    with open(out_file_path, 'w', encoding='utf-8') as out_file:
        for i, row in enumerate(count_matrix):
            row_str = delimiter.join([str(item) for item in row])
            if transpose:
                row_str += delimiter + class_label_list[i]

            out_file.write(row_str + '\n')

        if not transpose:
            out_file.write(delimiter.join(class_label_list) + '\n')

    return out_file_path, extension
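
The double list(zip(*...)) above is just a transpose applied twice: the first exposes the word column as a row so it can be quoted, the second restores the original orientation (as tuples). A tiny illustration with toy data, whose layout is only meant to show the mechanics:

count_matrix = [['', 'file_a', 'file_b'],
                ['word1', 1, 2],
                ['word2', 3, 4]]
transposed = list(zip(*count_matrix))  # columns become rows (as tuples)
# [('', 'word1', 'word2'), ('file_a', 1, 3), ('file_b', 2, 4)]
restored = list(zip(*transposed))      # transposing again restores the shape
# [('', 'file_a', 'file_b'), ('word1', 1, 2), ('word2', 3, 4)]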
Example #29
    def _file_manager_path(self) -> str:
        """Get the path of the file manager pickle file."""
        return os.path.join(session_folder(), constants.FILEMANAGER_FILENAME)