def updateCharacterList(self):
        """Rebuild the character list from Charnetto's analysis of the input.

        Nothing happens when no model or no (non-empty) input segmentation
        is available. Otherwise the content of every input segment is handed
        to ``charnetto.extract_spacy_df`` together with ``self.nlp``, the
        resulting tags are unified, and the output of
        ``charnetto.concatenate_parents`` is stored in ``self.char_list``.
        Each entry of that list is joined with ", " to build
        ``self.characters``; a copy is kept in ``self.cachedCaracters`` so
        that the list can be reset later.

        The control area is re-enabled and the progress bar closed even when
        one of the Charnetto calls raises; the exception then propagates.
        """
        # Sanity checks...
        if not self.model or not self.inputSeg:
            return

        # Init UI...
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=4)

        try:
            # Get input strings...
            strings = [segment.get_content() for segment in self.inputSeg]
            progressBar.advance()

            # Extract character tokens...
            # TODO deal with \n in names
            self.char_df = charnetto.extract_spacy_df(strings, self.nlp)
            progressBar.advance()

            # Unify spaCy tags to match those of flair...
            self.char_df = charnetto.unify_tags(self.char_df)
            progressBar.advance()

            # Collapse characters whose name is the prefix of another.
            self.char_list = charnetto.concatenate_parents(
                self.char_df,
                min_occ=1,
            )

            # Build char list.
            self.characters = [", ".join(char) for char in self.char_list]
            progressBar.advance()
        finally:
            # Always give the UI back, whatever happened above.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        # Cache character list for resetting if needed.
        self.cachedCaracters = self.characters[:]
    def sendData(self):
        """Compute result of widget processing and send to output.

        Without a model or without input, ``None`` is sent to the outputs
        (via ``self.sendNoneToOutputs``) and the method returns. Otherwise
        every row of ``self.char_df`` whose "tag" is "PER" is converted into
        a ``Segment`` located in the input segment that contains it and
        annotated with key "id" set to the row's "alias". The resulting
        segmentation is sent on the "Character segmentation" channel.

        Row positions are interpreted relative to the concatenation of all
        input segment contents with one separator character between two
        consecutive segments (hence the ``+ 1`` in the offsets below).

        The control area is re-enabled and the progress bar closed even when
        the processing raises; the exception then propagates.
        """

        # Check that there's a model...
        if not self.model:
            self.noLanguageModelWarning()
            self.sendNoneToOutputs()
            return

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.sendNoneToOutputs()
            return

        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )

        # Disable control area and initialize progress bar...
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.char_df))

        try:
            # Get start and end pos of concatenated input segments...
            startPositions = [0]
            endPositions = list()
            numSegments = len(self.inputSeg)
            for idx in range(1, numSegments):
                prevSegLen = len(self.inputSeg[idx-1].get_content())
                startPositions.append(startPositions[-1] + prevSegLen + 1)
                endPositions.append(startPositions[-1] - 1)
            endPositions.append(startPositions[-1] +
                                len(self.inputSeg[-1].get_content()) + 1)

            # Get or update character aliases...
            # The submodule is looked up in sys.modules, so it must already
            # have been imported elsewhere (KeyError otherwise).
            find_pairs = sys.modules['charnetto.find_pairs']
            characters = [entry.split(", ") for entry in self.characters]
            # presumably fills the "alias" column read below -- TODO confirm
            find_pairs.map_names(self.char_df, characters)

            # Initializations...
            charSegments = list()
            currentSegmentIdx = 0

            # For each character token in Charnetto's output...
            for _, charToken in self.char_df.iterrows():

                # The bar is sized to the number of rows, so every row
                # counts as one tick, including the skipped ones.
                progressBar.advance()

                # Skip non-PER named entities.
                if charToken["tag"] != "PER":
                    continue

                # Get index of containing segment...
                while charToken["end_pos"] > endPositions[currentSegmentIdx]:
                    currentSegmentIdx += 1

                # Create segment for char with its actual coordinates...
                offset = startPositions[currentSegmentIdx]
                charSegments.append(
                    Segment(
                        self.inputSeg[currentSegmentIdx].str_index,
                        charToken["start_pos"] - offset,
                        charToken["end_pos"] - offset,
                        {"id": charToken["alias"]},
                    )
                )

            # Send output...
            outputSegmentation = Segmentation(charSegments,
                                              label=self.captionTitle)
            self.send("Character segmentation", outputSegmentation, self)

            # Set status to OK and report data size...
            message = "%i segment@p sent to output." % len(outputSegmentation)
            message = pluralize(message, len(outputSegmentation))
            self.infoBox.setText(message)
        finally:
            # Clear progress bar and give the UI back in every case.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()
    def searchFunction(self):
        """Search Genius for ``self.newQuery`` and list the hits in the widget.

        Result pages are requested from the Genius search API one by one.
        Every hit is stored in ``self.searchResults`` under a 1-based integer
        key, as a dict holding its title, artist name, artist id and URL
        path. ``self.titleLabels`` is then rebuilt with one "title - artist"
        string per hit. With an empty query only a warning is displayed.

        The control area is re-enabled and the progress bar closed even when
        a request (or the decoding of its response) raises; the exception
        then propagates and ``self.searchResults`` is left untouched.
        """
        query_string = self.newQuery

        if query_string == "":
            self.infoBox.setText("You didn't search anything", "warning")
            return

        # Number of result pages to request (integer division: the value is
        # also the number of ticks of the progress bar).
        # assumes the API returns 10 hits per page -- TODO confirm
        page_max = int(self.nbr_results) // 10

        result_list = {}
        result_id = 0

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=page_max)

        try:
            for page in range(1, page_max + 1):
                values = {'q': query_string, 'page': page}
                data = urllib.parse.urlencode(values)
                query_url = 'http://api.genius.com/search?' + data
                json_obj = self.url_request(query_url)

                # Each result is stored in a dictionary with its title,
                # artist's name, artist's ID and URL path
                for hit in json_obj["response"]["hits"]:
                    result_id += 1
                    song = hit["result"]
                    result_list[result_id] = {
                        'artist': song["primary_artist"]["name"],
                        'artist_id': song["primary_artist"]["id"],
                        'path': song["path"],
                        'title': song["title"],
                    }

                # 1 tick on the progress bar of the widget
                progressBar.advance()

            # Store the results list
            self.searchResults = result_list

            # Reset and clear the visible widget list
            del self.titleLabels[:]

            # Update the results list with the search results
            # in order to display them
            for idx in self.searchResults:
                result_string = self.searchResults[idx]["title"] + " - " + \
                                self.searchResults[idx]["artist"]
                self.titleLabels.append(result_string)

            # NOTE(review): self-assignment kept on purpose; presumably it
            # triggers the refresh of the GUI control bound to this
            # attribute -- confirm before removing.
            self.titleLabels = self.titleLabels
            self.clearButton.setDisabled(False)
            self.addButton.setDisabled(self.selectedTitles == list())
        finally:
            # Clear progress bar and give the UI back in every case.
            progressBar.finish()
            self.controlArea.setDisabled(False)
    def sendData(self):
        """Compute result of widget processing and send to output.

        For every song of ``self.myBasket`` the lyrics page
        ("http://genius.com" + the song's 'path') is converted to text with
        ``self.html_to_text`` and stored in a new ``Input``. The output,
        sent on the "Lyrics importation" channel, is that single ``Input``
        or the concatenation of all of them, each segment being annotated
        with a copy of the corresponding basket entry.

        With an empty basket only a warning is displayed. If the download
        of any song fails, an error is displayed, the progress bar is
        closed, the control area re-enabled, and nothing is sent.
        """
        # Skip if title list is empty:
        if self.myBasket == list():
            self.infoBox.setText(
                "Your corpus is empty, please add some songs first", "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # Attempt to connect to Genius and retrieve lyrics...
        song_content = list()
        annotations = list()
        try:
            for song in self.myBasket:
                # song is a dict holding (at least) the 'path' of its page
                page_url = "http://genius.com" + song['path']
                lyrics = self.html_to_text(page_url)
                song_content.append(lyrics)
                annotations.append(song.copy())
                # 1 tick on the progress bar of the widget
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        # (Exception, not a bare except, so that KeyboardInterrupt and
        # SystemExit are not swallowed.)
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from Genius website.",
                                 "error")
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store downloaded lyrics strings in input objects...
        for lyrics in song_content:
            self.createdInputs.append(Input(lyrics, self.captionTitle))

        # If there's only one song, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Lyrics importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Exemplo n.º 5
0
    def sendData(self):
        """Run the morphological analysis of the input and emit the result.

        ``self.morphology`` is rebuilt from scratch: word counts, then the
        signatures, stems and suffixes learned by ``lxa5crab``, then the
        parser built from them. Every input segment is copied and annotated
        with the "stem", "suffix" ("NULL" when empty) and "signature" of its
        first parse; the copies are sent on the "Morphologically analyzed
        data" channel.

        Without input, or when signature learning raises ``ValueError``,
        ``None`` is sent instead and a warning is displayed.
        """
        outputChannel = "Morphologically analyzed data"

        # Start every run with an empty morphology...
        self.morphology = dict()

        # Nothing to analyze without input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            self.send(outputChannel, None, self)
            self.updateGUI()
            return

        # Step 1: word count (the bar has 100 ticks shared by all steps).
        self.infoBox.setText(
            u"Processing, please wait (word count)...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=100)

        wordCounts = collections.Counter(
            seg.get_content() for seg in self.inputSeg
        )
        self.morphology["wordCounts"] = wordCounts
        self.infoBox.setText(
            u"Processing, please wait (signature extraction)...", "warning")
        progressBar.advance(5)

        # Step 2: learn signatures; a ValueError aborts the whole run.
        try:
            lxa5crab.crab_nebula.MIN_STEM_LEN = self.minStemLen
            signatures, stems, suffixes = lxa5crab.find_signatures(wordCounts)
            self.morphology.update(
                signatures=signatures,
                stems=stems,
                suffixes=suffixes,
            )
        except ValueError as error:
            self.infoBox.setText(str(error), "warning")
            self.send(outputChannel, None, self)
            self.controlArea.setDisabled(False)
            progressBar.finish()
            self.morphology = dict()
            self.updateGUI()
            return
        self.infoBox.setText(
            u"Processing, please wait (word parsing)...", "warning")
        progressBar.advance(80)

        # Step 3: parse every word and annotate a copy of its segment.
        parser = lxa5crab.build_parser(wordCounts, signatures, stems, suffixes)
        self.morphology["parser"] = parser
        analyzedSegments = list()
        analyzedCount = 0
        for segment in self.inputSeg:
            parses = parser[segment.get_content()]
            annotated = segment.deepcopy()
            # Only the first parse is used.
            best = parses[0]
            if best.signature:
                analyzedCount += 1
            suffix = best.suffix if len(best.suffix) else "NULL"
            annotated.annotations.update({
                "stem": best.stem,
                "suffix": suffix,
                "signature": best.signature,
            })
            analyzedSegments.append(annotated)
        self.send(
            outputChannel,
            Segmentation(analyzedSegments, self.captionTitle),
            self,
        )
        self.updateGUI()
        progressBar.advance(15)

        # Report the number of segments and the share of analyzed words.
        totalWords = len(self.inputSeg)
        message = "%i segment@p sent to output (%.2f%% analyzed)." % (
            totalWords,
            analyzedCount / len(self.inputSeg) * 100,
        )
        message = pluralize(message, totalWords)
        self.infoBox.setText(message)

        # Give the UI back.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()
Exemplo n.º 6
0
    def summarize(self, cv, content):
        """Build an extractive summary of *content*.

        The text is split into sentences with ``self.nlp``. Word counts are
        obtained by fitting ``self.cv`` on the lowercased sentences and are
        divided by the highest count. A sentence's score is the sum of the
        relative frequencies of its known words divided by its number of
        tokens; sentences without any known word are not ranked. The best
        sentences are kept, in their original order: ``self.numSents`` of
        them when ``self.method`` is "Number of sentences", or
        ``self.percentage`` percent of the ranked sentences otherwise. When
        several sentences share the score at the cut-off, the earliest ones
        in the text are preferred, so that no more sentences than requested
        are returned.

        Args:
            cv: Not used; the vectorizer stored in ``self.cv`` is the one
                that is applied. Kept so that existing callers still work.
            content: Text to summarize.

        Returns:
            A pair of ``Input`` objects: the summary as plain text, and an
            HTML page holding the whole text with the retained sentences
            highlighted.

        Raises:
            ValueError: If ``self.method`` is not one of the two supported
                selection methods.
        """
        progressBar = ProgressBar(self, iterations=3)

        try:
            doc = self.nlp(content)
            sentences = list(doc.sents)

            corpus = [sent.text.lower() for sent in sentences]
            cv_fit = self.cv.fit_transform(corpus)

            # Count unique words and how many times they appear.
            # get_feature_names() no longer exists in recent scikit-learn
            # releases; prefer its replacement when it is available.
            getNames = getattr(self.cv, "get_feature_names_out", None)
            if getNames is None:
                getNames = self.cv.get_feature_names
            word_list = getNames()
            count_list = cv_fit.toarray().sum(axis=0)

            # Relative frequency of each word w.r.t. the most frequent one
            higher_frequency = max(count_list)
            word_frequency = {
                word: count / higher_frequency
                for word, count in zip(word_list, count_list)
            }

            progressBar.advance()

            # Score of each sentence holding at least one known word:
            # sum of its word frequencies, normalized by its token count
            sentence_rank = {}
            for sent in sentences:
                lowered = [token.text.lower() for token in sent]
                scores = [
                    word_frequency[word]
                    for word in lowered if word in word_frequency
                ]
                if scores:
                    sentence_rank[sent] = sum(scores) / len(lowered)

            progressBar.advance()

            # This is where we choose how many sentences are kept for the
            # summary, depending on the chosen method: sentences or %
            if self.method == "Number of sentences":
                numKept = self.numSents
            elif self.method == "Percentage of text lenght":
                numKept = int(
                    round(self.percentage * len(sentence_rank) / 100))
            else:
                raise ValueError(
                    "Unknown summary method: %r" % (self.method,))

            # Select the sentences themselves rather than their scores, so
            # that tied scores cannot pull in extra sentences. The sort is
            # stable: among equal scores, text order decides.
            ranked = sorted(
                sentence_rank, key=sentence_rank.get, reverse=True)
            selected = set(ranked[:numKept])
            summary = [sent for sent in sentence_rank if sent in selected]

            progressBar.advance()

            # Summary contains spaCy spans that must be converted to string
            # and joined in a single string
            resume = " ".join(str(sent) for sent in summary)

            # Create HTML resume: the whole text, retained sentences in bold
            # NOTE(review): sentence text is inserted into the HTML as is
            # (no escaping of markup characters).
            html_summary_str = [
                '<b style=\'color:blue\'>' + str(sent) + '</b>'
                if sent in selected else str(sent)
                for sent in sentences
            ]
            html_resume = "<!DOCTYPE html>\n<html>\n<body>\n" + " ".join(
                html_summary_str) + "\n</body>\n</html>"
        finally:
            progressBar.finish()

        # Create output segmentations from summary
        return Input(resume), Input(html_resume)
Exemplo n.º 7
0
    def sendData(self):
        """Compute result of widget processing and send to output.

        Every corpus of ``self.importedCorpora`` is a zip archive that is
        read from the local cache folder or, failing that (missing or
        unreadable cached file), downloaded and then cached on a best-effort
        basis. Each archived file becomes an ``Input`` annotated with its
        path in the archive, the "Corpus", "Lang" and "PID" attributes of its
        CHAT element (when present) and, if the file has exactly one target
        child, that child's data.

        Channels:
            "Files": the file segmentation.
            "Utterances": the "u" elements, if ``self.outputUtterances``,
                else ``None``.
            "Words": the "w" elements enriched with the annotations
                extracted from their "mw" elements, if ``self.outputWords``,
                else ``None``.

        Without any selected corpus, or if a download fails, ``None`` is
        sent on the three channels.
        """
        if not self.importedCorpora:
            self.infoBox.setText("Please add a corpus to the selection.",
                                 "warning")
            self.send("Files", None, self)
            self.send("Utterances", None, self)
            self.send("Words", None, self)
            return

        # Clear created Inputs and initialize progress bar...
        self.clearCreatedInputs()
        numberOfSteps = 2 if self.outputUtterances else 1
        numberOfSteps += 2 if self.outputWords else 0
        self.infoBox.setText(
            "(1/%i) Retrieving data, please wait..." % numberOfSteps,
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.importedCorpora))

        annotations = list()

        # Folder of this module: the cache folder is located relative to it.
        basepath = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        selfClosingTag = re.compile("/>")

        # Iterate over corpora...
        for importedCorpus in self.importedCorpora:

            corpus = importedCorpus.split("/")[-1]
            corpusFilepath = os.path.normpath(
                os.path.join(
                    basepath,
                    self.__class__.cachedFoldername,
                    importedCorpus[len(self.__class__.baseUrl):],
                ))

            # Try to retrieve corpus from cache...
            try:
                myZip = zipfile.ZipFile(corpusFilepath)

            # A corrupt cached file is treated like a missing one.
            except (IOError, zipfile.BadZipFile):

                # Else try to download requested zip file...
                try:
                    response = requests.get(importedCorpus)
                    myZip = zipfile.ZipFile(io.BytesIO(response.content))

                # If an error occurs (e.g. connection error)...
                except Exception:

                    # Set Info box and widget to "error" state.
                    self.infoBox.setText(
                        "Couldn't download corpus %s from CHILDES website." %
                        corpus, "error")

                    # Reset output channels.
                    self.send("Files", None, self)
                    self.send("Utterances", None, self)
                    self.send("Words", None, self)
                    progressBar.finish()
                    self.controlArea.setDisabled(False)
                    return

                # ...and cache it. Caching is best-effort: a failure here
                # must not prevent the use of the downloaded data.
                try:
                    os.makedirs(os.path.dirname(corpusFilepath))
                except OSError:
                    pass
                try:
                    with open(corpusFilepath, "wb") as outputFile:
                        outputFile.write(response.content)
                except IOError:
                    pass

            # Create Input for each zipped file and store annotations...
            with myZip:
                for zippedFile in myZip.infolist():
                    file_content = myZip.read(zippedFile).decode('utf-8')

                    # If word segmentation is requested...
                    if self.outputWords:
                        # Implement replacements.
                        file_content = re.sub(
                            r"<w.+?(<replacement.+</replacement>).*?</w>",
                            r"\1",
                            file_content,
                        )
                        # Prepend pre-clitics.
                        file_content = re.sub(
                            r"(<mor .+?)(<mor-pre>.+</mor-pre>)",
                            r"\2\1",
                            file_content,
                        )
                        # Move <gra> into <mw>.
                        file_content = re.sub(
                            r"(</mw>)(<gra.+?/>)",
                            r"\2\1",
                            file_content,
                        )

                    newInput = Input(
                        file_content, self.captionTitle + "_files")
                    self.createdInputs.append(newInput)
                    fileAnnotations = {"file_path": zippedFile.filename}
                    annotations.append(fileAnnotations)

                    # Attributes of the CHAT element...
                    chatSeg = Segmenter.import_xml(newInput, "CHAT")
                    for key in ["Corpus", "Lang", "PID"]:
                        try:
                            fileAnnotations[key.lower()] =  \
                                chatSeg[0].annotations[key]
                        except KeyError:
                            pass

                    # Target child data: self-closing participant tags are
                    # recoded into open/close pairs before being imported.
                    participantListSeg = Segmenter.import_xml(
                        newInput, "Participants")
                    recodedInput, _ = Segmenter.recode(
                        participantListSeg,
                        [(selfClosingTag, "> </participant>")])
                    participantSeg = Segmenter.import_xml(recodedInput,
                                                          "participant")
                    fileAnnotations.update(
                        self._targetChildAnnotations(participantSeg))

            progressBar.advance()

        # If there's only one file, the widget's output is the created Input...
        if len(self.createdInputs) == 1:
            self.fileSegmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.fileSegmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle + "_files",
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.fileSegmentation):
            segment.annotations.update(annotations[idx])
            self.fileSegmentation[idx] = segment

        # Terminate progress bar...
        progressBar.finish()

        message = "%i file@p" % len(self.fileSegmentation)
        message = pluralize(message, len(self.fileSegmentation))
        self.send("Files", self.fileSegmentation, self)

        # Build utterance segmentation if needed...
        if self.outputUtterances:
            self.infoBox.setText(
                "(2/%i) Building utterance segmentation, please wait..."    \
                    % numberOfSteps,
                "warning",
            )
            progressBar = ProgressBar(self,
                                      iterations=len(self.fileSegmentation))
            self.utteranceSegmentation = Segmenter.import_xml(
                self.fileSegmentation,
                "u",
                progress_callback=progressBar.advance,
                label=self.captionTitle + "_utterances",
            )
            progressBar.finish()
            message += " and " if not self.outputWords else ", "
            message += "%i utterance@p" % len(self.utteranceSegmentation)
            message = pluralize(message, len(self.utteranceSegmentation))
            self.send("Utterances", self.utteranceSegmentation, self)
        else:
            self.send("Utterances", None, self)

        # Build word segmentation if needed...
        if self.outputWords:
            self.infoBox.setText(
                "(%i/%i) Building word segmentation, please wait..."    \
                    % (2 + (1 if self.outputUtterances else 0), numberOfSteps),
                "warning",
            )
            # Words are searched in the utterances only when these were
            # built during this very run; an utterance segmentation left
            # over from a previous run must not be used.
            if self.outputUtterances:
                baseSegmentation = self.utteranceSegmentation
            else:
                baseSegmentation = self.fileSegmentation
            progressBar = ProgressBar(self,
                                      iterations=2 * len(baseSegmentation))
            wordSegmentation = Segmenter.import_xml(
                baseSegmentation,
                "w",
                progress_callback=progressBar.advance,
            )
            mwSegmentation = Segmenter.import_xml(
                baseSegmentation,
                "mw",
                progress_callback=progressBar.advance,
            )

            # Analyze words to extract annotations...
            self.infoBox.setText(
                "(%i/%i) Extracting word annotations, please wait..."    \
                    % (3 + (1 if self.outputUtterances else 0), numberOfSteps),
                "warning",
            )
            progressBar.finish()
            progressBar = ProgressBar(self, iterations=len(wordSegmentation))
            wordSegments = list()
            for word in wordSegmentation:
                mws = word.get_contained_segments(mwSegmentation)
                if mws:
                    # One output segment per "mw" element of the word.
                    for mw in mws:
                        wordSegment = word.deepcopy()
                        wordSegment.annotations.update(
                            self.extractWordAnnotations(mw))
                        wordSegments.append(wordSegment)
                else:
                    wordSegments.append(word)
                progressBar.advance()
            progressBar.finish()

            self.wordSegmentation = Segmentation(
                wordSegments,
                label=self.captionTitle + "_words",
            )

            message += " and %i word@p" % len(self.wordSegmentation)
            message = pluralize(message, len(self.wordSegmentation))
            self.send("Words", self.wordSegmentation, self)
        else:
            self.send("Words", None, self)

        # Set status to OK and report data size...
        message += " sent to output."
        message = pluralize(message, len(self.fileSegmentation))
        self.infoBox.setText(message)

        self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()

    def _targetChildAnnotations(self, participantSeg):
        """Return the annotations describing the target child of a file.

        One dict is built per participant whose "role" is "Target_Child";
        it holds the participant's "age", "id" and "sex" (when present)
        under "target_child_"-prefixed keys. An age matching the
        "<years>Y<months>M<days>D" pattern additionally yields the years,
        the total number of months (12 per year) and the total number of
        days (30 per month).

        The dict is returned only when there is exactly one target child;
        otherwise an empty dict is returned.
        """
        targetChildData = list()
        for participant in participantSeg:
            if participant.annotations["role"] != "Target_Child":
                continue
            childData = dict()
            targetChildData.append(childData)
            if "age" in participant.annotations:
                age = participant.annotations["age"]
                childData["target_child_age"] = age
                age_parse = re.search(r"(\d+)Y(\d+)M(\d+)D", age)
                if age_parse:
                    childData["target_child_years"] = age_parse.group(1)
                    months = int(age_parse.group(2))   \
                        + 12 * int(age_parse.group(1))
                    childData["target_child_months"] = '%02d' % months
                    days = int(age_parse.group(3)) + 30 * months
                    childData["target_child_days"] = '%02d' % days
            if "id" in participant.annotations:
                childData["target_child_id"] = participant.annotations["id"]
            if "sex" in participant.annotations:
                childData["target_child_sex"] = participant.annotations["sex"]
        if len(targetChildData) == 1:
            return targetChildData[0]
        return dict()
Exemplo n.º 8
0
    def sendData(self):
        """Compute result of widget processing and send to output.

        The content of every input segment is analyzed with ``self.nlp``
        and one ``Segment`` is created per resulting token. A token segment
        points to the same string as its input segment, at the token's
        offset from the start of that segment, and carries the input
        segment's annotations plus every attribute of the token listed in
        ``RELEVANT_KEYS`` whose value is not ``None``. The segmentation is
        sent on the "Linguistically analyzed data" channel.

        Without input, ``None`` is sent instead. The control area is
        re-enabled and the progress bar closed even when the analysis
        raises; the exception then propagates and nothing is sent.
        """

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            self.send("Linguistically analyzed data", None, self)
            return

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        tokenizedSegments = list()

        try:
            # Process each input segment...
            for segment in self.inputSeg:

                # Input segment attributes...
                inputAnnotations = segment.annotations
                inputString = segment.str_index
                inputStart = segment.start or 0

                # NLP analysis...
                doc = self.nlp(segment.get_content())

                # Process each token in input segment...
                for token in doc:
                    tokenAnnotations = inputAnnotations.copy()
                    for key in RELEVANT_KEYS:
                        value = getattr(token, key)
                        if value is not None:
                            tokenAnnotations[key] = value
                    tokenStart = inputStart + token.idx
                    tokenizedSegments.append(
                        Segment(
                            str_index=inputString,
                            start=tokenStart,
                            end=tokenStart + len(token),
                            annotations=tokenAnnotations,
                        ))

                progressBar.advance()

            outputSeg = Segmentation(tokenizedSegments, self.captionTitle)

            # Set status to OK and report data size...
            message = "%i segment@p sent to output." % len(outputSeg)
            message = pluralize(message, len(outputSeg))
            self.infoBox.setText(message)
        finally:
            # Clear progress bar and give the UI back in every case.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        # Send data to output...
        self.send("Linguistically analyzed data", outputSeg, self)

        self.sendButton.resetSettingsChangedFlag()
Exemplo n.º 9
0
    def sendData(self):
        """Download the scripts of the movies in the corpus and send them.

        For every entry of ``self.myBasket`` the matching page of the
        SpringfieldSpringfield website is fetched and the text of its
        ``movie_script`` div is stored in a new Input. The output is that
        Input when there is a single movie, otherwise the concatenation of
        all created Inputs. Each output segment is annotated with the keys
        "Movie Title" and "Year of release", both parsed from the basket
        entry.

        A warning is displayed and None is sent when the corpus is empty;
        an error is displayed (and nothing is sent) when a download fails.
        """

        # Skip if title list is empty (and stop here: there is nothing to
        # download, so the rest of the method must not run)...
        if not self.myBasket:
            self.infoBox.setText(
                "Your corpus is empty, please add some movies first",
                "warning")
            self.segmentation = None
            self.send("Movie Scripts importation", self.segmentation, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        annotations = list()
        script_list = list()
        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # This part of code is what fetches the actual script
        try:
            for movie in self.myBasket:
                # Each movie that is in the corpus is split into title and
                # year (rsplit makes sure to only split on the last
                # occurrence of "("); the trailing character of the year
                # part (the closing parenthesis) is dropped.
                title_and_year = movie.rsplit('(', 1)
                # A fresh dict per movie, so that annotations of different
                # movies never share state.
                annotations.append({
                    "Movie Title": title_and_year[0],
                    "Year of release": title_and_year[-1][:-1],
                })

                # link_end and page_url are the two variables that will have
                # to be changed in case scripts need to be taken from
                # elsewhere
                link_end = self.path_storage[movie]
                page_url = "https://www.springfieldspringfield.co.uk/movie_script.php?movie=" + link_end
                # The context manager closes the HTTP response once parsed.
                with urllib.request.urlopen(page_url) as page:
                    soup = BeautifulSoup(page, 'html.parser')

                # This is what grabs the movie script (find() returns None
                # when the div is missing; the resulting AttributeError is
                # reported as a download failure below).
                script = soup.find("div", {"class": "movie_script"})
                script_list.append(script.text)

                # 1 tick on the progress bar of the widget
                progressBar.advance()

        # Any failure while fetching or parsing is reported to the user;
        # KeyboardInterrupt and SystemExit are deliberately not intercepted.
        except Exception:
            self.infoBox.setText(
                "Couldn't download data from SpringfieldSpringfield website.",
                "error")
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store downloaded script strings in input objects...
        for script in script_list:
            newInput = Input(script, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one script, the widget's output is the created
        # Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments (the n-th segment gets the n-th annotation
        # dict, then is stored back in the segmentation)...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Movie Scripts importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
    def send_data(self):
        """Compute result of widget processing and send to output.

        The input table is analyzed with the method selected in
        ``self.method`` (LDA, LSI or correspondence analysis). Two
        PivotCrosstab tables are produced (terms x topics and documents x
        topics) and sent both as Textable tables and, converted with
        ``to_orange_table()``, as Orange tables. ``self.listEntries`` is
        updated with one description string per topic.

        When there is no input table, or when ``self.method`` is not one of
        the three known methods, None is sent on every output.
        """
        outputNames = (
            "Term-topic Textable table",
            "Document-topic Textable table",
            "Term-topic Orange table",
            "Document-topic Orange table",
        )

        # Check that there's a table in input...
        if self.inputTable is None:
            self.infoBox.setText(
                "Widget needs input.",
                "warning"
            )
            for outputName in outputNames:
                self.send(outputName, None)
            self.listEntries = list()
            return

        # Initialize progress bar.
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(
            self,
            iterations=1    # TODO
        )

        try:
            # Convert input table to gensim dictionary.
            dictionary, corpus = pivot_crosstab_to_gensim(self.inputTable)

            # Apply topic modelling...
            if self.method == "Latent Dirichlet allocation":
                tables = self._lda_tables(dictionary, corpus)
            elif self.method == "Latent semantic indexing":
                tables = self._lsi_tables(dictionary, corpus)
            elif self.method == "Correspondence analysis":
                tables = self._ca_tables()
            else:
                tables = None
        finally:
            # Clear progress bar and re-enable the UI, even when the
            # modelling step raised.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        # Unknown method: report it instead of failing on undefined tables.
        if tables is None:
            self.infoBox.setText(
                "Unknown topic modelling method: %s" % self.method,
                "error"
            )
            for outputName in outputNames:
                self.send(outputName, None)
            return

        segmentTopicTable, contextTopicTable, self.listEntries = tables

        # Set status to OK and report...
        self.infoBox.setText("Tables correctly sent to output.")

        # Send tables...
        self.send("Term-topic Textable table", segmentTopicTable)
        self.send("Document-topic Textable table", contextTopicTable)
        self.send(
            "Term-topic Orange table",
            segmentTopicTable.to_orange_table(),
        )
        self.send(
            "Document-topic Orange table",
            contextTopicTable.to_orange_table(),
        )

        self.sendButton.resetSettingsChangedFlag()

    def _topic_table_kwargs(self, header_col_id):
        """Return the PivotCrosstab arguments shared by all topic tables."""
        return dict(
            col_ids=list(range(self.numTopics)),
            header_row_id='__topic__',
            header_row_type='continuous',
            header_col_id=header_col_id,
            header_col_type='string',
            col_type={
                col_id: 'continuous' for col_id in range(self.numTopics)
            },
        )

    def _axis_list_entry(self, topicNum, propInertia, scores, colIds):
        """Format the listbox entry of one LSI or CA axis.

        Terms are sorted by decreasing score; when there are more than
        MAX_NUM_DISPLAYED_TERMS of them, only the head and the tail of the
        sorted list are displayed.
        """
        sortedTerms = colIds[scores.argsort()[::-1]]
        if len(colIds) > MAX_NUM_DISPLAYED_TERMS:
            displayedTerms = ", ".join(
                sortedTerms[:MAX_NUM_DISPLAYED_TERMS//2]
            )
            displayedTerms += ", ..., "
            displayedTerms += ", ".join(
                sortedTerms[-MAX_NUM_DISPLAYED_TERMS//2:]
            )
        else:
            displayedTerms = ", ".join(sortedTerms)
        return "%i. (%.2f%%) %s" % (
            topicNum+1,
            propInertia*100,
            displayedTerms,
        )

    def _lda_tables(self, dictionary, corpus):
        """Fit an LDA model; return (term table, document table, entries)."""
        model = models.LdaModel(
            corpus,
            id2word=dictionary,
            num_topics=self.numTopics,
        )
        numTerms = len(self.inputTable.col_ids)

        # Create segment-topic PivotCrosstab table.
        values = dict()
        terms = list()
        for topic in range(self.numTopics):
            topic_terms = model.get_topic_terms(topic, numTerms)
            for term, score in topic_terms:
                values[(dictionary[term], topic)] = score
            terms.append(
                [
                    dictionary[t]
                    for t, _ in topic_terms[:MAX_NUM_DISPLAYED_TERMS]
                ]
            )
        segmentTopicTable = PivotCrosstab(
            row_ids=self.inputTable.col_ids[:],
            values=values,
            **self._topic_table_kwargs('__unit__')
        )

        # Fill listbox...
        listEntries = list()
        for topicNum in range(self.numTopics):
            displayedTerms = ", ".join(terms[topicNum])
            if numTerms > MAX_NUM_DISPLAYED_TERMS:
                displayedTerms += ", ..."
            listEntries.append("%i. %s" % (topicNum+1, displayedTerms))

        # Create context-topic PivotCrosstab table...
        corpus_lda = model[corpus]
        values = dict()
        for row_idx, row in enumerate(self.inputTable.row_ids):
            for topic, score in corpus_lda[row_idx]:
                values[(row, topic)] = score
        contextTopicTable = PivotCrosstab(
            row_ids=self.inputTable.row_ids[:],
            values=values,
            missing=0,
            **self._topic_table_kwargs('__context__')
        )
        return segmentTopicTable, contextTopicTable, listEntries

    def _lsi_tables(self, dictionary, corpus):
        """Fit an LSI model; return (term table, document table, entries)."""
        model = models.LsiModel(
            corpus,
            id2word=dictionary,
            num_topics=self.numTopics,
        )

        # Create segment-topic PivotCrosstab table.
        segmentTopicTable = PivotCrosstab.from_numpy(
            row_ids=self.inputTable.col_ids[:],
            np_array=model.projection.u,
            **self._topic_table_kwargs('__unit__')
        )

        # Fill listbox...
        colIds = np.array(self.inputTable.col_ids)
        # Total inertia is the sum of the eigenvalues of the doc-term matrix
        # multiplied by its transpose, i.e. the trace of that product, which
        # equals the sum of the squared entries of the doc-term matrix. This
        # avoids an eigendecomposition (and its possibly complex output).
        total_inertia = np.sum(np.square(self.inputTable.to_numpy()))
        listEntries = list()
        for topicNum in range(self.numTopics):
            # Proportion of inertia is SQUARE of singular value divided by
            # total inertia, because n-th singular value = square root of
            # n-th eigenvalue...
            propInertia = model.projection.s[topicNum] ** 2 / total_inertia
            listEntries.append(
                self._axis_list_entry(
                    topicNum,
                    propInertia,
                    model.projection.u[:, topicNum],
                    colIds,
                )
            )

        # Create context-topic PivotCrosstab table...
        contextTopicMatrix = corpus2dense(
            model[corpus], len(model.projection.s)
        ).T / model.projection.s
        values = dict()
        for row_idx, row in enumerate(contextTopicMatrix):
            row_id = self.inputTable.row_ids[row_idx]
            for topic, val in enumerate(row):
                values[(row_id, topic)] = val
        contextTopicTable = PivotCrosstab(
            row_ids=self.inputTable.row_ids[:],
            values=values,
            missing=0,
            **self._topic_table_kwargs('__context__')
        )
        return segmentTopicTable, contextTopicTable, listEntries

    def _ca_tables(self):
        """Run a CA; return (term table, document table, entries)."""
        ca = correspondence(self.inputTable.to_numpy())

        # Create segment-topic PivotCrosstab table.
        segmentTopicTable = PivotCrosstab.from_numpy(
            row_ids=self.inputTable.col_ids[:],
            np_array=ca.col_factors[:, range(self.numTopics)],
            **self._topic_table_kwargs('__unit__')
        )

        # Fill listbox...
        colIds = np.array(self.inputTable.col_ids)
        axisInertia = ca.inertia_of_axis()
        total_inertia = sum(axisInertia)
        listEntries = list()
        for topicNum in range(self.numTopics):
            listEntries.append(
                self._axis_list_entry(
                    topicNum,
                    axisInertia[topicNum] / total_inertia,
                    np.array(ca.col_factors[:, topicNum]),
                    colIds,
                )
            )

        # Create context-topic PivotCrosstab table.
        # NOTE(review): the LDA and LSI document tables use '__context__'
        # as header_col_id while this one uses '__unit__' -- possibly a
        # copy-paste slip; kept unchanged to preserve the output.
        contextTopicTable = PivotCrosstab.from_numpy(
            row_ids=self.inputTable.row_ids[:],
            np_array=ca.row_factors[:, range(self.numTopics)],
            **self._topic_table_kwargs('__unit__')
        )
        return segmentTopicTable, contextTopicTable, listEntries
 def updateCharacterList(self):
     """Update character list based on Charnet output.

     The content of all input segments is joined into a single string,
     which is passed through three Charnet steps (``extract_spacy_df``,
     ``unify_tags`` and ``concatenate_parents``). The results are stored in
     ``self.char_df`` and ``self.char_list``, and ``self.characters``
     receives one comma-separated string per entry of ``self.char_list``.

     Nothing is computed when there is no input segmentation.
     """
     if self.mustLoad:
         self.loadModel()

     # Without input there is nothing to iterate over (None would raise).
     if self.inputSeg is None:
         return

     self.controlArea.setDisabled(True)
     progressBar = ProgressBar(self, iterations=4)
     try:
         string = " ".join(
             segment.get_content() for segment in self.inputSeg
         )
         progressBar.advance()
         self.char_df = charnet.extract_spacy_df(string, self.nlp) # TODO progress
         progressBar.advance()
         self.char_df = charnet.unify_tags(self.char_df)
         progressBar.advance()
         self.char_list = charnet.concatenate_parents(self.char_df, min_occ = 1)
         self.characters = [", ".join(char) for char in self.char_list]
         progressBar.advance()
     finally:
         # Always close the progress bar and re-enable the UI, even when
         # one of the steps above raised.
         progressBar.finish()
         self.controlArea.setDisabled(False)
Exemplo n.º 12
0
class Treetagger(OWTextableBaseWidget):
    """Orange widget for POS-tagging and lemmatization with Treetagger"""

    name = "Treetagger"
    description = "POS-tagging and lemmatization with Treetagger"
    icon = "icons/treetagger.svg"
    priority = 2003

    inputs = [("Segmentation", Segmentation, "inputData")]
    outputs = [("Tagged data", Segmentation)]

    # Settings are versioned on the package version minus its last component.
    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0]
    )

    language = settings.Setting(0)
    replaceUnknown = settings.Setting(False)
    outputFormat = settings.Setting("segment into words")

    want_main_area = False

    # File (in the user data dir) where the Treetagger base dir is saved.
    configFilePath = os.path.normpath(
        appdirs.user_data_dir("textable", "langtech") + "/treetagger_path"
    )

    def __init__(self, *args, **kwargs):
        """Initialize a Treetagger widget"""
        super().__init__(*args, **kwargs)

        # Other attributes...
        self.segmentation = None
        self.createdInputs = list()
        self.noLanguageParameterWarning = (
            "Please make sure that at least one language parameter "
            "file is installed in your Treetagger 'lib' directory, "
            "then click 'Reload language parameter files'."
        )
        self.noTreetaggerPathWarning = (
            "Please click 'Locate Treetagger' below and select the "
            "base directory of a valid Treetagger distribution."
        )
        # Automatic detection first, then the path saved in app data.
        self.TreetaggerPath = (
            treetaggerwrapper.locate_treetagger() or
            self.lookupSavedTreetaggerPath()
        )

        self.infoBox = InfoBox(widget=self.controlArea)

        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute=u"infoBox",
            sendIfPreCallback=self.updateGUI
        )

        gui.separator(self.controlArea, height=3)

        self.optionsBox = gui.widgetBox(
            self.controlArea,
            u"Options",
        )

        self.languageCombobox = gui.comboBox(
            widget=self.optionsBox,
            master=self,
            value="language",
            items=list(),
            sendSelectedValue=True,
            orientation=u"horizontal",
            label="Input language:",
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"Select the language of the input text."
            ),
        )
        self.languageCombobox.setMinimumWidth(120)

        gui.separator(self.optionsBox, height=3)

        gui.comboBox(
            widget=self.optionsBox,
            master=self,
            value="outputFormat",
            items=[
                "segment into words",
                "add XML tags",
            ],
            sendSelectedValue=True,
            orientation=u"horizontal",
            label="Output format:",
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"Select the format of the output:\n\n"
                u"Segment into words: each word is in a separate segment,\n"
                u"with lemma and POS-tag as annotations.\n\n"
                u"Add XML tags: output segments correspond to input segments\n"
                u"and each word is tagged in XML as a 'w' element with\n"
                u"lemma and POS-tag as attributes."
            ),
        )

        gui.separator(self.optionsBox, height=3)

        gui.checkBox(
            widget=self.optionsBox,
            master=self,
            value="replaceUnknown",
            label="Output token in place of [unknown] lemmas",
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"For out-of-vocabulary words, the word form is used as the\n"
                u"lemma (in place of Treetagger's default 'unknown' code)."
            ),
        )

        gui.rubber(self.controlArea)

        self.sendButton.draw()
        self.infoBox.draw()

        self.locateTreetaggerBox = gui.widgetBox(
            self.controlArea,
            addSpace=False,
        )

        gui.separator(self.locateTreetaggerBox, height=3)

        self.treetaggerButton = gui.button(
            widget=self.locateTreetaggerBox,
            master=self,
            label="Locate Treetagger",
            callback=self.validateTreetagger,
            tooltip=(
                u"Click to select the location of the Treetagger base\n"
                u"directory (containing the 'lib' and 'bin' subdirectories)."
            ),
        )

        self.sendButton.sendIf()

        self.adjustSizeWithTimer()

    def inputData(self, inputData):
        """Process incoming data."""
        self.segmentation = inputData
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def sendData(self):
        """Tag the input segmentation with Treetagger and send the result.

        A warning is displayed and None is sent when the Treetagger path is
        unknown, when no language is available or when there is no input.
        Otherwise all input segments are tagged in a single Treetagger call
        and the output is either one segment per token (with lemma and
        POS-tag annotations) or the XML-tagged text, depending on
        ``self.outputFormat``.
        """

        # Clear created Inputs...
        self.clearCreatedInputs()

        if not self.TreetaggerPath:
            self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.getAvailableLanguages():
            self.infoBox.setText(self.noLanguageParameterWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.segmentation:
            self.infoBox.setText(
                u"Widget needs input",
                "warning"
            )
            self.send("Tagged data", None)
            return

        # Initialize progress bar.
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        self.progressBar = ProgressBar(
            self,
            iterations = 5
        )

        # Create a copy of input seg, storing annotations in temp attr...
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.segmentation.label
        for segment in self.segmentation:
            attr = " ".join(
                ["%s='%s'" % item for item in segment.annotations.items()]
            )
            segment.annotations["tt_ax"] = attr
            copy_of_input_seg.append(segment)

        self.progressBar.advance()

        # Dump segmentation in unique string to avoid multiple calls to TT...
        concatenated_text = copy_of_input_seg.to_string(
            formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
            display_all=True,
        )

        self.progressBar.advance()

        # Tag the segmentation contents...
        tagopt = '-token -lemma -sgml -quiet'
        if self.replaceUnknown:
            tagopt += " -no-unknown"
        tagger = treetaggerwrapper.TreeTagger(
            TAGLANG=pycountry.languages.get(name=self.language).alpha_2,
            TAGOPT=tagopt,
            TAGDIR=self.TreetaggerPath,
        )
        tagged_lines = tagger.tag_text(
            concatenated_text,
            notagurl=True,
            notagemail=True,
            notagip=True,
            notagdns=True,
        )
        tagged_input = Input("\n".join(tagged_lines))
        self.createdInputs.append(tagged_input)

        # Re-segment to match the original segmentation structure.
        tagged_segmentation = Segmenter.import_xml(tagged_input, "ax_tt")

        self.progressBar.advance()

        # Replace <unknown> with [unknown], " with &quot; and place
        # each output line of Treetagger in an xml tag with annotations...
        xml_segmentation, _ = Segmenter.recode(
            tagged_segmentation,
            substitutions = [
                (re.compile(r"<unknown>"), "[unknown]"),
                (re.compile(
                    r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                    '<w lemma="&3" pos-tag="&2">&1</w>'
                ),
                (re.compile(r'"""'), '"&quot;"'),
                (re.compile(r'^\n|\n$'), ''),
            ],
        )
        # Segment into individual tokens if XML output option is disabled...
        if self.outputFormat == "add XML tags":
            output_segmentation = xml_segmentation
        else:
            try:
                output_segmentation = Segmenter.import_xml(
                    xml_segmentation,
                    "w"
                )
            except ValueError:
                self.infoBox.setText(
                    "Please check that either the input contains well-formed "
                    "XML, or it doesn't contain instances of '&#60;' and '\x3e'",
                    "error"
                )
                self.send("Tagged data", None)
                self.progressBar.finish()
                self.controlArea.setDisabled(False)
                return

        self.progressBar.finish()
        self.controlArea.setDisabled(False)

        output_segmentation.label = self.captionTitle
        message = u'%i segment@p sent to output.' % len(output_segmentation)
        message = pluralize(message, len(output_segmentation))
        self.infoBox.setText(message)
        self.send('Tagged data', output_segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def updateGUI(self):
        """Update GUI state"""
        if self.TreetaggerPath:
            self.optionsBox.setDisabled(False)
            self.locateTreetaggerBox.setVisible(False)
            # Rebuild the language list from scratch.
            self.languageCombobox.clear()
            languages = self.getAvailableLanguages()
            if not languages:
                self.infoBox.setText(self.noLanguageParameterWarning, "warning")
                self.optionsBox.setDisabled(True)
                self.locateTreetaggerBox.setVisible(True)
                self.treetaggerButton.setText("Reload language parameter files")
            else:
                # Keep the current language if any, else default to the
                # first available one.
                self.language = self.language or languages[0]
        else:
            self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
            self.optionsBox.setDisabled(True)
            self.locateTreetaggerBox.setVisible(True)
        self.adjustSizeWithTimer()

    def getAvailableLanguages(self):
        """Return the names of the languages usable with Treetagger.

        Every language code known to treetaggerwrapper is probed by
        instantiating a tagger for it; codes for which this (or the lookup
        of the language name) fails are skipped. Each usable language is
        also added to the language combobox unless it is already listed
        there, so that repeated calls do not create duplicate entries.
        """
        languages = list()
        for lang_code in sorted(treetaggerwrapper.g_langsupport):
            if lang_code.startswith("__"):
                continue
            try:
                treetaggerwrapper.TreeTagger(
                    TAGLANG=lang_code,
                    TAGDIR=self.TreetaggerPath,
                )
                language = pycountry.languages.get(alpha_2=lang_code).name
            except Exception:
                # Best-effort probe: a language that cannot be loaded or
                # named is simply not offered to the user.
                continue
            if self.languageCombobox.findText(language) == -1:
                self.languageCombobox.addItem(language)
            languages.append(language)
        return languages

    def lookupSavedTreetaggerPath(self):
        """Look for a saved Treetagger base dir path in app data

        Returns the saved path if it is still valid, else None; a saved
        path that is no longer valid is removed from app data.
        """
        configFilePath = self.__class__.configFilePath
        if not os.path.exists(configFilePath):
            return None
        try:
            with open(configFilePath, "r") as inputFile:
                TreetaggerSavedPath = inputFile.read()
            if self.checkTreetaggerPath(TreetaggerSavedPath):
                return TreetaggerSavedPath
            os.remove(configFilePath)
        except IOError:
            pass
        return None

    def validateTreetagger(self):
        """Respond to user actions needed to validate Treetagger path"""

        # If the Treetagger path is known, make sure there are language files...
        if self.TreetaggerPath:
            if self.getAvailableLanguages():
                self.sendButton.settingsChanged()
                self.updateGUI()
            else:
                QMessageBox.warning(
                    None,
                    'Textable',
                    'Language parameter files not found.',
                    QMessageBox.Ok
                )
            return

        # Else if the path is not known...

        # First try to locate it automatically...
        TreetaggerPath = treetaggerwrapper.locate_treetagger()

        # If it fails, let the user locate it manually...
        if not (TreetaggerPath and self.checkTreetaggerPath(TreetaggerPath)):

            # Forget an automatically located path that is not valid, so
            # that it is never saved below.
            TreetaggerPath = None

            selectedDir = str(
                QFileDialog.getExistingDirectory(
                    self, u"Please locate Treetagger base directory"
                )
            )

            # If user selected a dir (test the raw value: normalizing an
            # empty selection would turn it into ".")...
            if selectedDir:
                TreetaggerManualPath = os.path.normpath(selectedDir)

                # Check if selected dir contains Treetagger binary...
                if self.checkTreetaggerPath(TreetaggerManualPath):
                    TreetaggerPath = TreetaggerManualPath
                else:
                    QMessageBox.warning(
                        None,
                        'Textable',
                        'Not a valid Treetagger base directory.',
                        QMessageBox.Ok
                    )

        # If a valid path was found somehow, save config to app data...
        if TreetaggerPath:
            configFilePath = self.__class__.configFilePath
            try:
                # Creates the parent dir and any missing ancestor.
                os.makedirs(os.path.dirname(configFilePath), exist_ok=True)
                with open(configFilePath, "w") as outputFile:
                    outputFile.write(TreetaggerPath)
            except IOError:
                # Saving is a convenience only; the path is still used for
                # the current session.
                pass
            self.TreetaggerPath = TreetaggerPath

            self.sendButton.settingsChanged()

    def checkTreetaggerPath(self, path):
        """Check if path is a valid Treetagger base dir

        A dir is considered valid when it contains the Treetagger binary
        ("bin/tree-tagger", with an ".exe" suffix on Windows).
        """
        return os.path.exists(
            os.path.normpath(
                path + "/bin/tree-tagger" + (".exe" if os.name == "nt" else "")
            )
        )

    def clearCreatedInputs(self):
        """Reset the stored data of every created Input and forget them"""
        for i in self.createdInputs:
            Segmentation.set_data(i[0].str_index, None)
        del self.createdInputs[:]

    def onDeleteWidget(self):
        """Free memory when widget is deleted (overriden method)"""
        self.clearCreatedInputs()

    def setCaption(self, title):
        """Set the caption and flag settings as changed if it differs"""
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
Exemplo n.º 13
0
    def sendData(self):
        """Run Treetagger on the input segmentation and send the result.

        When the Treetagger path is unknown, no language is available or
        there is no input, a warning is shown and None is sent. Otherwise
        the whole input is tagged in one Treetagger call and the output is
        either the XML-tagged text or one segment per token, depending on
        ``self.outputFormat``.
        """

        # Clear created Inputs...
        self.clearCreatedInputs()

        # Find out whether a prerequisite is missing (checked in order,
        # stopping at the first failure)...
        warning = None
        if not self.TreetaggerPath:
            warning = self.noTreetaggerPathWarning
        elif not self.getAvailableLanguages():
            warning = self.noLanguageParameterWarning
        elif not self.segmentation:
            warning = u"Widget needs input"
        if warning is not None:
            self.infoBox.setText(warning, "warning")
            self.send("Tagged data", None)
            return

        # Initialize progress bar.
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        self.progressBar = ProgressBar(self, iterations=5)

        # Gather the input segments in a new segmentation, serializing the
        # annotations of each one into a temporary "tt_ax" annotation...
        inputCopy = Segmentation()
        inputCopy.label = self.segmentation.label
        for segment in self.segmentation:
            serialized = " ".join(
                "%s='%s'" % pair for pair in segment.annotations.items()
            )
            segment.annotations["tt_ax"] = serialized
            inputCopy.append(segment)
        self.progressBar.advance()

        # A single string for all segments means a single call to TT...
        taggerInput = inputCopy.to_string(
            formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
            display_all=True,
        )
        self.progressBar.advance()

        # Tag the segmentation contents...
        tagopt = '-token -lemma -sgml -quiet' + (
            " -no-unknown" if self.replaceUnknown else ""
        )
        languageCode = pycountry.languages.get(name=self.language).alpha_2
        tagger = treetaggerwrapper.TreeTagger(
            TAGLANG=languageCode,
            TAGOPT=tagopt,
            TAGDIR=self.TreetaggerPath,
        )
        tagged_input = Input(
            "\n".join(
                tagger.tag_text(
                    taggerInput,
                    notagurl=True,
                    notagemail=True,
                    notagip=True,
                    notagdns=True,
                )
            )
        )
        self.createdInputs.append(tagged_input)

        # Recover the structure of the original segmentation.
        tagged_segmentation = Segmenter.import_xml(tagged_input, "ax_tt")
        self.progressBar.advance()

        # Turn <unknown> into [unknown], wrap every Treetagger output line
        # in a "w" element carrying lemma and POS-tag, escape the double
        # quote token and strip leading/trailing newlines...
        substitutions = [
            (re.compile(r"<unknown>"), "[unknown]"),
            (
                re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                '<w lemma="&3" pos-tag="&2">&1</w>',
            ),
            (re.compile(r'"""'), '"&quot;"'),
            (re.compile(r'^\n|\n$'), ''),
        ]
        xml_segmentation, _ = Segmenter.recode(
            tagged_segmentation,
            substitutions=substitutions,
        )

        # Unless XML output was requested, split into individual tokens...
        if self.outputFormat != "add XML tags":
            try:
                result = Segmenter.import_xml(xml_segmentation, "w")
            except ValueError:
                self.infoBox.setText(
                    "Please check that either the input contains well-formed "
                    "XML, or it doesn't contain instances of '&#60;' and '\x3e'",
                    "error"
                )
                self.send("Tagged data", None)
                self.progressBar.finish()
                self.controlArea.setDisabled(False)
                return
        else:
            result = xml_segmentation

        self.progressBar.finish()
        self.controlArea.setDisabled(False)

        # Label the output, report its size and send it.
        result.label = self.captionTitle
        numSegments = len(result)
        self.infoBox.setText(
            pluralize(
                u'%i segment@p sent to output.' % numSegments,
                numSegments,
            )
        )
        self.send('Tagged data', result, self)
        self.sendButton.resetSettingsChangedFlag()
    def searchMovies(self):
        """Search the IMDb movie database for the current query.

        The query is read from ``self.newQuery`` and at most
        ``self.nbr_results`` hits are kept.  The hits are stored in
        ``self.searchResults``, a dict keyed by consecutive integers
        starting at 1; each value holds the movie object under ``'name'``,
        its IMDb identifier under ``'id'`` and, when IMDb provides it, the
        release year under ``'year'``.  One display label per hit is placed
        in ``self.titleLabels``.

        An empty query only displays a warning in the info box.
        """
        query_string = self.newQuery
        if query_string == "":
            self.infoBox.setText("Please enter a movie title", "warning")
            return

        counter_max = int(self.nbr_results)

        self.controlArea.setDisabled(True)

        # Initialize progress bar
        progressBar = ProgressBar(self, iterations=counter_max)

        # The try/finally guarantees that the widget is usable again even
        # if the search itself fails (e.g. network error).
        try:
            ia = imdb.IMDb()
            search = ia.search_movie(query_string)

            # Each result is stored in a dictionary with its title, its id
            # and its year of publication if it is specified.
            result_list = {}
            for result_id, result in enumerate(search, start=1):
                if result_id > counter_max:
                    break
                entry = {'name': result, 'id': result.movieID}
                try:
                    entry['year'] = result['year']
                except KeyError:
                    # No year for this title: keep the entry without it.
                    pass
                result_list[result_id] = entry

            # 1 tick on the progress bar of the widget
            progressBar.advance()
            self.searchResults = result_list

            # Reset the visible list, then fill it with one label per hit.
            del self.titleLabels[:]
            for entry in result_list.values():
                if 'year' in entry:
                    label = f'{entry["name"]} - {entry["year"]}'
                else:
                    label = f'{entry["name"]}'
                self.titleLabels.append(label)

            # NOTE(review): self-assignment kept on purpose; presumably it
            # triggers the refresh of the GUI list bound to this attribute.
            self.titleLabels = self.titleLabels
            self.clearButton.setDisabled(False)
            self.addButton.setDisabled(False)
        finally:
            # Clear progress bar.
            progressBar.finish()
            self.controlArea.setDisabled(False)
Exemplo n.º 15
0
    def refreshDatabaseCache(self):
        """Refresh the database cache.

        Scrapes the CHILDES website starting at the class attribute
        ``baseUrl`` (via ``recursivelyScrapeUrl``), stores the result in
        ``self.database`` and pickles it to the class attribute
        ``cacheFilename`` in the directory of this module.  If the folder
        named by the class attribute ``cachedFoldername`` exists, the user
        is first asked whether previously saved files should be kept; they
        are removed after scraping when the answer is "No".
        """
        basepath = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        cacheFoldername = self.__class__.cachedFoldername

        # Ask whether to keep previously saved files. (The former extra
        # test on os.walk('.') was always true and needlessly walked the
        # whole current working directory.)
        removeSavedFiles = False
        if os.path.exists(cacheFoldername):
            dialog = AnyQt.QtGui.QMessageBox()
            response = dialog.question(self, "CHILDES",
                                       "Keep previously saved files?",
                                       dialog.Yes | dialog.No)
            removeSavedFiles = (response == dialog.No)

        self.infoBox.setText(
            "Connecting to CHILDES website, please wait...",
            "warning",
        )
        progressBar = ProgressBar(self, iterations=1)
        self.controlArea.setDisabled(True)

        # Scrape website...
        self.database = dict()
        self.importedCorpora = list()
        try:
            self.recursivelyScrapeUrl(
                self.__class__.baseUrl,
                self.database,
            )
            # Dump cache to file; the context manager closes the file even
            # if pickling fails.
            cachePath = os.path.join(basepath, self.__class__.cacheFilename)
            try:
                with open(cachePath, "wb") as cacheFile:
                    pickle.dump(self.database, cacheFile)
            except IOError:
                self.infoBox.setText(
                    "Couldn't save database to disk.",
                    "warning",
                )
            self.sendButton.settingsChanged()
        except requests.exceptions.ConnectionError:
            self.infoBox.setText(
                "Error while attempting to scrape the CHILDES website.",
                "error",
            )
            self.send("Files", None, self)
            self.send("Utterances", None, self)

        # Remove saved files if required...
        if removeSavedFiles:
            shutil.rmtree(cacheFoldername)

        progressBar.advance()
        progressBar.finish()
        self.currentFolder = self.__class__.baseUrl
        self.updateDisplayedFolders()
        self.updateSelection()
        self.controlArea.setDisabled(False)
    def sendData(self):
        """Compute result of widget processing and send to output.

        Downloads the IMDb reviews of every movie in ``self.myBasket``
        (each item must provide an ``'id'`` key), creates one ``Input`` per
        review and sends the resulting segmentation on the 'Segmentation'
        channel.  Every segment is annotated with a copy of the review
        dictionary it was built from.

        If the basket is empty or the download fails, a message is shown
        and nothing is sent.  If none of the movies has any review, a
        warning is shown and ``None`` is sent.
        """
        # Skip if title list is empty:
        if not self.myBasket:
            self.infoBox.setText(
                "Your corpus is empty, please add some movies first",
                "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # Attempt to connect to IMDb and retrieve the reviews...
        list_review = list()
        try:
            ia = imdb.IMDb()
            for item in self.myBasket:
                list_review.append(ia.get_movie_reviews(item['id']))
                # 1 tick on the progress bar of the widget
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from imdb", "error")
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store movie reviews in input objects; movies without a 'data' or
        # 'reviews' entry simply contribute nothing.
        annotations = list()
        for movie in list_review:
            reviews_data = (movie.get('data') or {}).get('reviews') or []
            for review in reviews_data:
                self.createdInputs.append(Input(review.get('content')))
                annotations.append(review.copy())

        # Nothing to build a segmentation from...
        if not self.createdInputs:
            self.infoBox.setText(
                "The selected movies have no associated reviews",
                "warning",
            )
            progressBar.finish()
            self.controlArea.setDisabled(False)
            self.send('Segmentation', None, self)
            return

        # If there's only one item, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = f"{len(self.segmentation)} segment@p sent to output "
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send('Segmentation', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Exemplo n.º 17
0
    def sendData(self):
        """Compute result of widget processing and send to output.

        Runs the spaCy pipeline ``self.nlp`` on every segment of
        ``self.inputSeg`` and sends the tokens on "Tokenized text" and,
        depending on ``self.segmentEntities``, ``self.segmentChunks`` and
        ``self.segmentSentences``, the corresponding segmentations on
        "Named entities", "Noun chunks" and "Sentences".

        Nothing is processed when no model is available, when there is no
        input, or when the input is longer than the limit selected in
        ``self.maxLen``; in the last two cases ``None`` is sent on every
        output channel.
        """

        # Check that there's a model...
        if not self.model:
            self.infoBox.setText(
                "Please download a language model first.",
                "warning",
            )
            self.tabs.setCurrentIndex(1)
            return

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            for channel in [c.name for c in self.outputs]:
                self.send(channel, None, self)
            return

        # Check max length and adjust if needed...
        inputLength = sum(len(s.get_content()) for s in self.inputSeg)
        if self.maxLen != "no limit":
            maxNumChar = int(self.maxLen.split()[0]) * 1000000
            if inputLength > maxNumChar:
                self.infoBox.setText(
                    "Input exceeds max number of characters set by user.", 
                    "warning",
                )
                for channel in [c.name for c in self.outputs]:
                    self.send(channel, None, self)
                return
        else:
            # Decided below, once the pipeline is guaranteed to be loaded.
            maxNumChar = None

        # Load components if needed...
        _, enabled = self.getComponentStatus()
        if self.mustLoad or not(
            self.nlp and set(enabled) <= set(self.loadedComponents)
        ):
            self.loadModel()

        # Without a user limit, raise the pipeline's limit only as far as
        # needed for this input (and never leave maxNumChar undefined).
        if maxNumChar is None:
            maxNumChar = max(inputLength, self.nlp.max_length)
        self.nlp.max_length = maxNumChar

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...", 
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        tokenSegments = list()
        entitySegments = list()
        chunkSegments = list()
        sentenceSegments = list()

        # Only loaded components can be disabled; this does not depend on
        # the segment, so it is computed once.
        disabled, _ = self.getComponentStatus()
        loadedComponents = set(self.loadedComponents)
        disabled = [c for c in disabled if c in loadedComponents]

        # Process each input segment...
        for segment in self.inputSeg:

            # NLP analysis...
            with self.nlp.disable_pipes(*disabled):
                doc = self.nlp(segment.get_content())

            # Get token segments...
            tokenSegments.extend(spacyItemsToSegments(doc, segment))

            # Get named entity segments...
            if self.segmentEntities:
                entitySegments.extend(spacyItemsToSegments(doc.ents, segment))

            # Get noun chunk segments...
            if self.segmentChunks:
                chunkSegments.extend(
                    spacyItemsToSegments(doc.noun_chunks, segment), 
                )

            # Get sentences segments...
            if self.segmentSentences:
                sentenceSegments.extend(
                    spacyItemsToSegments(doc.sents, segment), 
                )

            progressBar.advance()

        # Build segmentations and send them to output...
        tokenSeg = Segmentation(tokenSegments, self.captionTitle + "_tokens")
        self.send("Tokenized text", tokenSeg, self)
        if self.segmentChunks:
            chunkSeg = Segmentation(
                chunkSegments, 
                self.captionTitle + "_chunks",
            )
            self.send("Noun chunks", chunkSeg, self)
        if self.segmentEntities:
            entitySeg = Segmentation(
                entitySegments, 
                self.captionTitle + "_entities",
            )
            self.send("Named entities", entitySeg, self)
        if self.segmentSentences:
            sentenceSeg = Segmentation(
                sentenceSegments, 
                self.captionTitle + "_sentences",
            )
            self.send("Sentences", sentenceSeg, self)

        # Set status to OK and report data size...
        message = "%i token@p" % len(tokenSeg)
        message = pluralize(message, len(tokenSeg))
        if self.segmentChunks:
            message += ", %i chunk@p" % len(chunkSeg)
            message = pluralize(message, len(chunkSeg))
        if self.segmentEntities:
            message += ", %i " % len(entitySeg)
            message += "entity" if len(entitySeg) == 1 else "entities"
        if self.segmentSentences:
            message += ", %i sentence@p" % len(sentenceSeg)
            message = pluralize(message, len(sentenceSeg))
        message += " sent to output."
        # Turn the last comma of the enumeration into " and".
        last_comma_idx = message.rfind(",")
        if last_comma_idx > -1:
            message = message[:last_comma_idx] + " and" +    \
                message[last_comma_idx+1:]
        self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Compute result of widget processing and send to output.

        Downloads the XML file of every title in ``self.selectedTitles``
        (URL built from ``self.document_base_url`` and the title's "url"
        annotation), wraps each document in an ``Input`` and sends the
        resulting segmentation on the "XML-TEI data" channel.  Each segment
        receives a copy of the annotations of the title it comes from.

        Does nothing when the title list is empty; sends ``None`` when
        nothing is selected or when a download fails.
        """

        # Skip if title list is empty:
        if not self.titleLabels:
            return

        # Check that something has been selected...
        if not self.selectedTitles:
            self.infoBox.setText("Please select one or more titles.",
                                 "warning")
            self.send("XML-TEI data", None, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        # Initialize progress bar.
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.selectedTitles))

        # Attempt to connect to ECP and retrieve plays...
        xml_contents = list()
        annotations = list()
        try:
            for title in self.selectedTitles:
                title_annotations = self.filteredTitleSeg[title].annotations
                doc_url = self.document_base_url + title_annotations["url"]
                # ".../name.shtml" -> ".../name/name.xml"
                url = re.sub(r"/([^/]+)\.shtml", r"/\1/\1.xml", doc_url)
                # The context manager closes the connection in all cases.
                with urllib.request.urlopen(url) as response:
                    xml_contents.append(response.read().decode('utf-8'))
                annotations.append(title_annotations.copy())
                progressBar.advance()  # 1 tick on the progress bar...

        # If an error occurs (e.g. http error, or memory error)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from ECP website.",
                                 "error")
            # Reset output channel.
            self.send("XML-TEI data", None, self)
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store downloaded XML in input objects...
        for xml_content in xml_contents:
            self.createdInputs.append(Input(xml_content, self.captionTitle))

        # If there's only one play, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Store imported URLs as setting (only the first selected title).
        self.importedURLs = [
            self.filteredTitleSeg[self.selectedTitles[0]].annotations["url"]
        ]

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        # Send token...
        self.send("XML-TEI data", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Load files, create and send segmentation.

        In basic mode the single file ``self.file`` is read with
        ``self.encoding``; in advanced mode every entry of ``self.files``
        is read.  An entry is a sequence laid out as ``[path, encoding,
        annotation key, annotation value, PDF password, OCR languages,
        force-OCR flag]``.  The way the content is extracted depends on the
        type detected by ``filetype.guess``: PDF files and images go
        through the dedicated helpers, anything else is read as raw text.

        One ``Input`` is created per file; the output sent on 'Text data'
        is that Input when there is a single file, otherwise the
        concatenation of all Inputs.  ``None`` is sent when no file is
        selected, when the auto-numbering key is missing, or when a file
        cannot be opened.
        """

        # Check that there's something on input...
        if (
            (self.displayAdvancedSettings and not self.files) or
            not (self.file or self.displayAdvancedSettings)
        ):
            self.infoBox.setText(u'Please select input file.', 'warning')
            self.send('Text data', None, self)
            return

        # Check that autoNumberKey is not empty (if necessary)...
        if (
            self.displayAdvancedSettings
            and self.autoNumber
            and not self.autoNumberKey
        ):
            self.infoBox.setText(
                u'Please enter an annotation key for auto-numbering.',
                'warning'
            )
            self.send('Text data', None, self)
            return

        # Clear created Inputs...
        self.clearCreatedInputs()

        fileContents = list()
        annotations = list()
        counter = 1

        if self.displayAdvancedSettings:
            myFiles = self.files
        else:
            myFiles = [[
                self.file,
                self.encoding,
                u'',
                u'',
                u'',
                self.ocrLanguages,
                self.ocrForce
            ]]

        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(
            self,
            iterations=len(myFiles)
        )

        # Open and process each file successively...
        for myFile in myFiles:
            filePath = myFile[0]
            # Drop the parenthesised description following the codec name.
            encoding = re.sub(r"[ ]\(.+", "", myFile[1])
            annotation_key = myFile[2]
            annotation_value = myFile[3]
            ocr_force = myFile[6] # SuperTextFiles

            # Try to open the file...
            self.error()
            # Start SuperTextFiles
            try:
                # Type detection reads the file, hence it belongs inside
                # the try block.
                myFiletype = filetype.guess(filePath)

                if myFiletype is None:
                    fileContent = self.extract_raw_text(filePath, encoding)

                elif myFiletype.extension == "pdf":
                    if (
                        ocr_force is not True
                        and self.is_textual_pdf_file(filePath) is True
                    ):
                        fileContent = self.extract_text_from_pdf(filePath)
                    else:
                        fileContent = self.get_pdf_content(filePath)

                elif myFiletype.extension in IMG_FILETYPES:
                    fileContent = self.ocrize(filePath)

                else:
                    # Recognised type without a dedicated extractor: read
                    # it as raw text rather than leaving fileContent
                    # undefined (or equal to the previous file's content).
                    fileContent = self.extract_raw_text(filePath, encoding)
            # End SuperTextFiles

            except IOError:
                progressBar.finish()
                if len(myFiles) > 1:
                    message = u"Couldn't open file '%s'." % filePath
                else:
                    message = u"Couldn't open file."
                self.infoBox.setText(message, 'error')
                self.send('Text data', None, self)
                self.controlArea.setDisabled(False)
                return

            # Remove utf-8 BOM if necessary...
            if encoding == u'utf-8':
                fileContent = fileContent.lstrip(
                    codecs.BOM_UTF8.decode('utf-8')
                )

            # Normalize text (canonical decomposition then composition)...
            fileContent = normalize('NFC', fileContent)

            fileContents.append(fileContent)

            # Annotations...
            annotation = dict()
            if self.displayAdvancedSettings:
                if annotation_key and annotation_value:
                    annotation[annotation_key] = annotation_value
                if self.importFilenames and self.importFilenamesKey:
                    filename = os.path.basename(filePath)
                    annotation[self.importFilenamesKey] = filename
                if self.autoNumber and self.autoNumberKey:
                    annotation[self.autoNumberKey] = counter
                    counter += 1
            annotations.append(annotation)
            progressBar.advance()

        # Create an LTTL.Input for each file; a single file gets the
        # widget's caption as label.
        if len(fileContents) == 1:
            label = self.captionTitle
        else:
            label = None
        for fileContent, annotation in zip(fileContents, annotations):
            myInput = Input(fileContent, label)
            segment = myInput[0]
            segment.annotations.update(annotation)
            myInput[0] = segment
            self.createdInputs.append(myInput)

        # If there's only one file, the widget's output is the created Input.
        if len(fileContents) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                segmentations=self.createdInputs,
                label=self.captionTitle,
                copy_annotations=True,
                import_labels_as=None,
                sort=False,
                auto_number_as=None,
                merge_duplicates=False,
                progress_callback=None,
            )

        message = u'%i segment@p sent to output ' % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Exemplo n.º 20
0
    def sendData(self):
        """Fetch the books of the basket from Gutenberg and send them.

        Every entry of ``self.myBasket`` is indexed as title (0), author
        (1), Gutenberg id (2) and language (3).  The text of each book is
        downloaded, stripped of its headers, wrapped in an ``Input`` and
        annotated with its title, author and language.  The resulting
        segmentation is sent on the "Gutenberg importation" channel.

        An empty basket or a failed download only updates the info box.
        """
        # Nothing to do without at least one book in the basket.
        if self.myBasket == []:
            self.infoBox.setText(
                "Your corpus is empty, please add some books first", "warning")
            return

        # Start from a clean slate and lock the interface.
        self.clearCreatedInputs()
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        downloaded = []
        metadata = []

        try:
            for book in self.myBasket:
                # Download with Gutenbergpy, then drop the headers.
                raw = gutenbergpy.textget.get_text_by_id(book[2])
                body = gutenbergpy.textget.strip_headers(raw).decode("utf-8")
                downloaded.append(body)

                # Remember the metadata used later as annotations.
                metadata.append({
                    "title": book[0],
                    "author": book[1],
                    "language": book[3],
                })
                progressBar.advance()
        except Exception as exc:
            # Any failure (e.g. http error, or memory error) aborts the
            # whole import and puts the widget in "error" state.
            self.infoBox.setText("Couldn't download data from Gutenberg",
                                 "error")
            self.controlArea.setDisabled(False)
            print(exc)
            return

        # One Input object per downloaded text.
        for body in downloaded:
            self.createdInputs.append(Input(body, self.captionTitle))

        # Several texts are concatenated; a single one is output as is.
        if len(self.createdInputs) != 1:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )
        else:
            self.segmentation = self.createdInputs[0]

        # Attach the book metadata to the matching segment.
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(metadata[idx])
            self.segmentation[idx] = segment

        # Unlock the interface.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        # Report the number of segments and characters sent.
        segmentCount = len(self.segmentation)
        message = pluralize(
            "%i segment@p sent to output " % segmentCount,
            segmentCount,
        )
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message = pluralize(message + "(%i character@p)." % numChars, numChars)
        self.infoBox.setText(message)

        self.send("Gutenberg importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Exemplo n.º 21
0
    def get_large_audio_transcription(self,
                                      path,
                                      language,
                                      set_silence_len=500,
                                      set_silence_threshold=14):
        """Split a large audio file on silences and transcribe each chunk.

        Args:
            path: path of the audio file; an mp3 file (as reported by
                ``self.detect_format``) is first converted with
                ``self.to_wav``.
            language: accepted for interface compatibility; the recognition
                language is actually looked up from ``self.language`` in
                ``AudioFile.dict_languages``.
            set_silence_len: minimal length of a silence, passed to
                ``split_on_silence`` as ``min_silence_len``.
            set_silence_threshold: value subtracted from the sound's dBFS
                to obtain the ``silence_thresh`` of ``split_on_silence``.

        Returns:
            ``None`` if the name in ``self.file`` mentions neither "wav"
            nor "mp3".  Otherwise the transcriptions of the recognised
            chunks, each capitalised and followed by ". ": as a list when
            ``self.selected_seg`` is true, else joined in a single string.
        """
        # Only wav and mp3 files are handled. (The former test used "or"
        # and therefore rejected every file not mentioning both formats.)
        fileName = self.file.lower()
        if 'wav' not in fileName and 'mp3' not in fileName:
            return

        transcriptions = list()

        # Create a temporary folder to handle the chunks, will be deleted
        # upon completion of the task
        with tempfile.TemporaryDirectory() as tempDict:

            # Initialize the recognizer
            r = sr.Recognizer()

            # Check type of the audio file and change it to wav if mp3
            audio_type = self.detect_format(path)

            if audio_type == "mp3":
                path = self.to_wav(path, tempDict)

            # Open the audio file using pydub
            sound = AudioSegment.from_wav(path)
            # Split the sound where a silence lasts at least
            # set_silence_len and get chunks
            chunks = split_on_silence(
                sound,
                # Experiment with this value for your target audio file
                min_silence_len=set_silence_len,
                # Adjust this per requirement
                silence_thresh=sound.dBFS - set_silence_threshold,
                # Amount of silence kept around each chunk, adjustable too
                keep_silence=500,
            )

            # Initiate alert message and progress bar
            progressBar = ProgressBar(self, iterations=len(chunks))
            self.infoBox.setText(u"Processing, please wait...", "warning")

            # The progress bar is cleared even if recognition fails with
            # an error that is not handled below.
            try:
                # Process each chunk
                for i, audio_chunk in enumerate(chunks, start=1):
                    # export audio chunk and save it in
                    # the tempDict directory.
                    chunk_filename = os.path.join(tempDict, f"chunk{i}.wav")
                    audio_chunk.export(chunk_filename, format="wav")
                    # recognize the chunk
                    with sr.AudioFile(chunk_filename) as source:
                        audio_listened = r.record(source)
                    # Try converting it to text
                    try:
                        # Get the value of the chosen language in the
                        # dictionary
                        text = r.recognize_google(
                            audio_listened,
                            language=AudioFile.dict_languages[self.language])
                    except sr.UnknownValueError as e:
                        # Unintelligible chunk: report it and go on.
                        print("Error : ", str(e))
                    else:
                        text = f"{text.capitalize()}. "
                        print(chunk_filename, " : ", text)
                        transcriptions.append(text)
                    # One tick per chunk, whether it was recognised or not.
                    progressBar.advance()
            finally:
                progressBar.finish()

        # return the text for all chunks detected
        if self.selected_seg:
            return transcriptions
        return "".join(transcriptions)