class Childes(OWTextableBaseWidget):
    """Textable widget for importing data in XML format from the CHILDES
    database (https://childes.talkbank.org/data-xml/).

    The user browses a cached tree of the database (a nested dict keyed by
    folder URL), picks a zipped corpus and asks for it to be downloaded.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "CHILDES"
    description = "Import XML data from the CHILDES database"
    icon = "icons/CHILDES.svg"
    priority = 12

    #----------------------------------------------------------------------
    # Channel definitions (NB: no input in this case)...

    inputs = []
    outputs = [("XML data", Segmentation)]

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    importedCorpus = settings.Setting(None)
    autoSend = settings.Setting(False)

    #----------------------------------------------------------------------
    # Other class variables...

    base_url = "https://childes.talkbank.org/data-xml/"
    cache_filename = "cache_childes"

    want_main_area = False

    def __init__(self):
        """Widget creator."""
        super().__init__()

        # Other (non-setting) attributes...
        self.segmentation = None
        self.displayedFolderLabels = list()
        self.currentFolder = self.__class__.base_url
        self.database = None
        self.selectedItems = list()

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
        )

        # User interface...

        # Browse database box
        browseBox = gui.widgetBox(
            widget=self.controlArea,
            box="Browse database",
            orientation="vertical",
            addSpace=False,
        )

        upwardNavBox = gui.widgetBox(
            widget=browseBox,
            box=False,
            orientation="horizontal",
        )
        self.homeRefreshButton = gui.button(
            widget=upwardNavBox,
            master=self,
            label="Home",
            callback=self.homeRefreshPressed,
            tooltip="Return to database root.",
        )
        self.backButton = gui.button(
            widget=upwardNavBox,
            master=self,
            label="Back",
            callback=self.backPressed,
            tooltip="View parent folder.",
        )

        gui.separator(widget=browseBox, height=3)

        self.currentFolderLabel = gui.label(
            widget=browseBox,
            master=self,
            label="Current folder: /",
            tooltip="This is the currently displayed folder.",
        )

        gui.separator(widget=browseBox, height=3)

        displayedFolderListbox = gui.listBox(
            widget=browseBox,
            master=self,
            value="selectedItems",
            labels="displayedFolderLabels",
            callback=self.corpusSelected,
            tooltip="Select an item to open or import.",
        )
        displayedFolderListbox.setMinimumHeight(150)
        displayedFolderListbox.setSelectionMode(1)
        displayedFolderListbox.doubleClicked.connect(
            self.listBoxDoubleClicked)

        self.importedCorpusLabel = gui.label(
            widget=browseBox,
            master=self,
            label="No corpus imported yet.",
            tooltip="This is the currently imported corpus.",
        )

        gui.separator(widget=browseBox, height=3)

        downwardNavBox = gui.widgetBox(
            widget=browseBox,
            box=False,
            orientation="horizontal",
        )
        self.openButton = gui.button(
            widget=downwardNavBox,
            master=self,
            label="Open",
            callback=self.openPressed,
            tooltip="View selected folder's contents.",
        )
        self.importButton = gui.button(
            widget=downwardNavBox,
            master=self,
            label="Import",
            callback=self.importPressed,
            tooltip="Import selected item's contents.",
        )

        gui.separator(widget=browseBox, height=3)

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()

        # This initialization step needs to be done after infoBox has been
        # drawn (because we may need to display an error message).
        self.loadDatabaseCache()

        # Send data if autoSend.
        self.sendButton.sendIf()

        self.setMinimumWidth(350)
        self.adjustSizeWithTimer()

    def _selectedLabel(self):
        """Return the label of the selected listbox item, or None.

        None is returned when nothing is selected or when the stored
        selection index does not fit the currently displayed labels.
        """
        if not self.selectedItems:
            return None
        index = self.selectedItems[0]
        if not 0 <= index < len(self.displayedFolderLabels):
            return None
        return self.displayedFolderLabels[index]

    def sendData(self):
        """Download the imported corpus archive and report the outcome.

        When no corpus has been imported, or when the download fails, a
        message is shown in the info box and None is sent on the output
        channel.
        """
        if not self.importedCorpus:
            self.infoBox.setText(
                "Please select a corpus to import.", "warning")
            self.send("XML data", None, self)
            return

        progressBar = None
        try:
            response = requests.get(self.importedCorpus)
            # Turn HTTP error statuses into exceptions instead of trying to
            # unzip an error page.
            response.raise_for_status()
            with zipfile.ZipFile(io.BytesIO(response.content)) as myZip:
                members = myZip.infolist()
                # One progress step per archive member.
                progressBar = ProgressBar(
                    self, iterations=max(len(members), 1))
                for member in members:
                    # NOTE(review): the archive content is only listed for
                    # now; no segmentation is built from it yet.
                    print(member.filename, len(myZip.read(member)))
                    progressBar.advance()
        except (requests.exceptions.RequestException,
                zipfile.BadZipFile) as exc:
            print(exc)
            self.infoBox.setText(
                "Couldn't download corpus from CHILDES website.", "error")
            self.send("XML data", None, self)
            return
        finally:
            if progressBar is not None:
                progressBar.finish()

        # Name the corpus after what was actually imported rather than after
        # the current listbox selection, which may be empty or different.
        corpus = self.importedCorpus.split("/")[-1]
        self.importedCorpusLabel.setText(
            "Corpus %s correctly imported." % corpus)
        self.infoBox.setText("All good!")

        self.sendButton.resetSettingsChangedFlag()

    def homeRefreshPressed(self):
        """Go back to the database root and refresh the display."""
        self.currentFolder = self.__class__.base_url
        self.updateDisplayedFolders()

    def backPressed(self):
        """Display parent folder's contents."""
        # Drop the trailing slash, remove the last path component, then put
        # the trailing slash back.
        self.currentFolder = "/".join(
            self.currentFolder[:-1].split("/")[:-1])
        self.currentFolder += "/"
        self.updateDisplayedFolders()

    def corpusSelected(self):
        """Refresh the button states after a listbox selection change."""
        self.updateBrowseBoxButtons()

    def openPressed(self):
        """Display selected folder's contents."""
        label = self._selectedLabel()
        if label is None:
            return
        self.currentFolder += label
        self.updateDisplayedFolders()

    def importPressed(self):
        """Mark the selected corpus as the one to import."""
        corpus = self._selectedLabel()
        if corpus is None:
            return
        self.importedCorpus = self.currentFolder + corpus
        self.importButton.setDisabled(True)
        self.importedCorpusLabel.setText(
            "Corpus %s ready to import." % corpus)
        self.sendButton.settingsChanged()

    def listBoxDoubleClicked(self):
        """Reroute to 'openPressed' or 'importPressed' as needed."""
        label = self._selectedLabel()
        if label is None:
            return
        if label.endswith(".zip"):
            self.importPressed()
        else:
            self.openPressed()

    def loadDatabaseCache(self):
        """Load the cached database tree, then display the root folder.

        The cache is looked up in this module's directory. If it cannot be
        read, the widget falls back on 'rebuildCacheFromWebsite' when such a
        method is available; otherwise an error is shown and the browser
        stays empty.
        """
        path = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        cachePath = os.path.join(path, self.__class__.cache_filename)
        try:
            # NOTE(review): pickle executes arbitrary code on load; this is
            # only acceptable because the cache file sits next to the module.
            with open(cachePath, "rb") as file:
                self.database = pickle.load(file)
        except (IOError, EOFError, pickle.UnpicklingError):
            # Else try to rebuild cache from CHILDES website...
            rebuild = getattr(self, "rebuildCacheFromWebsite", None)
            if callable(rebuild):
                self.database = rebuild()
            else:
                self.database = None
                self.infoBox.setText(
                    "Couldn't load the CHILDES database cache.", "error")

        self.currentFolder = self.__class__.base_url
        self.updateDisplayedFolders()

    def updateDisplayedFolders(self):
        """Refresh state of displayed folder listbox."""
        baseUrl = self.__class__.base_url

        # Current folder label (path relative to the base URL, keeping the
        # leading slash)...
        currentFolder = self.currentFolder[len(baseUrl) - 1:]
        self.currentFolderLabel.setText("Current folder: " + currentFolder)

        # Without a database there is nothing to browse.
        if not self.database:
            self.displayedFolderLabels = list()
            self.updateBrowseBoxButtons()
            return

        # Walk down the nested dicts, one level per path component...
        folderContent = self.database[baseUrl]
        steps = currentFolder[:-1].split("/")[1:]
        for depth in range(1, len(steps) + 1):
            path = baseUrl + "/".join(steps[:depth]) + "/"
            folderContent = folderContent[path]

        # Populate listbox: archives keep their file name, folders are shown
        # by their last path component followed by a slash.
        self.displayedFolderLabels = [
            item if item.endswith(".zip") else item.split("/")[-2] + "/"
            for item in folderContent
        ]

        # Imported corpus label...
        if self.importedCorpus:
            self.importedCorpusLabel.setText(
                "Corpus %s ready to import."
                % self.importedCorpus.split("/")[-1])

        # Buttons.
        self.updateBrowseBoxButtons()

    def updateBrowseBoxButtons(self):
        """Refresh state of Browse box buttons."""
        currentFolder = self.currentFolder[
            len(self.__class__.base_url) - 1:]
        atRoot = currentFolder == "/"
        self.homeRefreshButton.setDisabled(atRoot)
        self.backButton.setDisabled(atRoot)

        # Folders (labels ending with "/") can be opened, archives (labels
        # ending with ".zip") can be imported.
        label = self._selectedLabel()
        self.openButton.setDisabled(
            label is None or label.endswith(".zip"))
        self.importButton.setDisabled(
            label is None or label.endswith("/"))

    # The following method need to be copied (without any change) in
    # every Textable widget...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class MovieScripts(OWTextableBaseWidget):
    """Textable widget for importing movie scripts.

    Titles are matched against a catalogue with fuzzy search and the script
    of the selected title is downloaded from
    https://www.springfieldspringfield.co.uk.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Movie Scripts"
    description = "Movie Script Importation"
    icon = "icons/Movie_Scripts.png"
    priority = 11

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = []
    outputs = [("Movie Scripts importation", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    # Saved settings
    autoSend = settings.Setting(True)
    myBasket = settings.Setting([])

    # Built-in sample catalogue (title -> URL slug) used when the caller
    # does not supply one.
    _SAMPLE_TITLES = {
        "Die Hard (1988)": "die-hard",
        "Watchmen (2009)": "watchmen",
        "Back to the Future (1985)": "back-to-the-future",
        "Die Hard 2 (1990)": "die-hard-2",
    }

    def __init__(self):
        """Widget creator."""
        super().__init__()

        # Search attributes
        self.searchResults = None
        self.inputSeg = None
        # newQuery = content of the "Movie title" line edit
        self.newQuery = ''
        self.nbr_results = 10
        # Results box attributes
        self.titleLabels = list()
        self.selectedTitles = list()
        # Selections box attributes
        self.myTitles = list()
        self.mytitleLabels = list()
        # Downloaded scripts
        self.createdInputs = list()
        # URL slugs, parallel to titleLabels (same indexes)
        self.path_storage = list()

        # Info box helper from TextableUtils, declared here and drawn below.
        # NOTE: this widget does not create a SendButton (yet).
        self.infoBox = InfoBox(widget=self.controlArea)

        # User interface...
        # Create the working area
        queryBox = gui.widgetBox(
            widget=self.controlArea,
            box="Search movie",
            orientation="vertical",
        )

        # Free-text query, bound to the "newQuery" attribute
        gui.lineEdit(
            widget=queryBox,
            master=self,
            value='newQuery',
            orientation='horizontal',
            label=u"Movie title: ",
            labelWidth=100,
            tooltip=("Enter a string"),
        )

        # Maximum number of results, bound to the "nbr_results" attribute
        gui.comboBox(
            widget=queryBox,
            master=self,
            value="nbr_results",
            items=[
                "5",
                "10",
                "20",
                "30",
                "40",
                "50",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Number of results: ",
            labelWidth=120,
            tooltip=("Please select the desired search.\n"),
        )

        # Search button, triggers "searchFunction"
        self.searchButton = gui.button(
            widget=queryBox,
            master=self,
            label='Search',
            callback=self.searchFunction,
            tooltip='Search for the movie',
        )

        # List of matching titles
        self.titleListbox = gui.listBox(
            widget=queryBox,
            master=self,
            value="selectedTitles",    # setting (list)
            labels="titleLabels",      # setting (list)
            callback=lambda: self.selectButton.setDisabled(
                self.selectedTitles == list()),
            tooltip="Select the movie you want to get the script of",
        )
        self.titleListbox.setMinimumHeight(120)
        self.titleListbox.setSelectionMode(3)

        boxbutton = gui.widgetBox(
            widget=queryBox,
            box=False,
            orientation='horizontal',
        )

        # Select button, triggers "sendData"
        self.selectButton = gui.button(
            widget=boxbutton,
            master=self,
            label="Select",
            callback=self.sendData,
            tooltip="Select",
        )
        self.selectButton.setDisabled(True)

        # Clear button, triggers "clearResults"
        self.clearButton = gui.button(
            widget=boxbutton,
            master=self,
            label="Clear results",
            callback=self.clearResults,
            tooltip="Clear results",
        )
        self.clearButton.setDisabled(True)

        gui.separator(widget=queryBox, height=3)

        gui.rubber(self.controlArea)

        #----------------------------------------------------------------------
        # Draw Info box

        self.searchButton.setDefault(True)
        self.infoBox.draw()

    def searchFunction(self, testdict=None):
        """Fuzzy-search the catalogue for the current query.

        Args:
            testdict: optional dict mapping displayed titles to URL slugs.
                Anything that is not a dict (e.g. the boolean passed by the
                button's click signal) selects the built-in sample catalogue.

        The matched titles are displayed in the results list and their slugs
        are stored at the same indexes in 'path_storage'.
        """
        if not isinstance(testdict, dict):
            testdict = self._SAMPLE_TITLES

        query_string = self.newQuery

        # Reset the visible list together with its parallel slug list, so
        # that listbox indexes always point at the right slug.
        del self.titleLabels[:]
        del self.path_storage[:]
        self.titleLabels = self.titleLabels

        if query_string == "":
            self.clearButton.setDisabled(True)
            self.infoBox.setText("You didn't search anything", "warning")
            return

        # With a dict of choices, each match is (value, score, key): the
        # query is compared with the slugs and the title comes last.
        searchResults = process.extractBests(
            query_string,
            testdict,
            limit=int(self.nbr_results),
            score_cutoff=70,
        )
        for slug, _score, title in searchResults:
            self.titleLabels.append(title)
            self.path_storage.append(slug)

        # Reassigning the attribute refreshes the listbox.
        self.titleLabels = self.titleLabels
        self.clearButton.setDisabled(False)
        self.controlArea.setDisabled(False)
        self.infoBox.setText("Search complete")

    def clearResults(self):
        """Clear the results list"""
        del self.titleLabels[:]
        del self.path_storage[:]
        self.titleLabels = self.titleLabels
        self.clearButton.setDisabled(True)
        self.selectButton.setDisabled(True)

    @staticmethod
    def get_all_titles(title_to_href):
        """Collect movie titles from www.springfieldspringfield.co.uk.

        Args:
            title_to_href: dict updated in place with title -> URL slug
                entries scraped from the site's script index.

        Pages are fetched one after the other until a page contains no
        script link. Only the '0' index is crawled at the moment.
        """
        php_query_string = '/movie_script.php?movie='
        http_query_string = (
            'https://www.springfieldspringfield.co.uk/'
            'movie_scripts.php?order='
        )
        link_class = re.compile("^script-list-item")

        for lettre in ['0']:
            page_num = 1
            while True:
                page_url = http_query_string + '%s&page=%i' % (
                    lettre, page_num)
                with urllib.request.urlopen(page_url) as page:
                    soup = BeautifulSoup(page, 'html.parser')
                script_links = soup.findAll(
                    'a', attrs={'class': link_class})
                if not script_links:
                    break
                # Keep only the slug, i.e. what follows the query prefix.
                title_to_href.update({
                    link.text: link.get('href')[len(php_query_string):]
                    for link in script_links
                })
                print(page_num)
                page_num += 1

    def sendData(self):
        """Download the script of the first selected title.

        The script text is stored in 'createdInputs' and 'segmentation'.
        Errors are reported in the info box; the control area is re-enabled
        in every case.
        """
        # Clear created Inputs.
        self.clearCreatedInputs()

        if (not self.selectedTitles
                or self.selectedTitles[0] >= len(self.path_storage)):
            self.infoBox.setText("Please select a movie first.", "warning")
            return
        link_end = self.path_storage[self.selectedTitles[0]]

        self.controlArea.setDisabled(True)
        try:
            page_url = (
                "https://www.springfieldspringfield.co.uk/"
                "movie_script.php?movie=" + link_end
            )
            with urllib.request.urlopen(page_url) as page:
                soup = BeautifulSoup(page, 'html.parser')
            # 'script' is None when the page has no script container; the
            # resulting AttributeError is reported like a download failure.
            script = soup.find("div", {"class": "movie_script"})
            new_input = script.text
            self.createdInputs.append(new_input)
            self.segmentation = self.createdInputs[0]
            # NOTE(review): the script is only printed for now; nothing is
            # sent on the output channel yet.
            print(self.createdInputs[0])
            self.infoBox.setText("Script downloaded!")
        except Exception as exc:
            print(exc)
            self.infoBox.setText(
                "Couldn't download data from SpringfieldSpringfield website.",
                "error")
        finally:
            self.controlArea.setDisabled(False)

    def clearCreatedInputs(self):
        """Delete all Input objects that have been created."""
        del self.createdInputs[:]

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            # This widget has no send button yet, so only notify one if it
            # exists.
            sendButton = getattr(self, "sendButton", None)
            if changed and sendButton is not None:
                sendButton.settingsChanged()
        else:
            super().setCaption(title)
class Gutenberg(OWTextableBaseWidget):
    """Textable widget for importing clean texts from Gutenberg
    (https://www.gutenberg.org/)

    Books are looked up in a local Gutenberg cache, collected in a corpus
    ("basket") and downloaded when the data are sent.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Gutenberg"
    description = "Gutenberg caching and importation"
    icon = "icons/gutenberg.png"
    priority = 10

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = []
    outputs = [("Gutenberg importation", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    # Saved settings
    autoSend = settings.Setting(False)
    myBasket = settings.Setting([])

    def __init__(self):
        """Widget creator."""
        super().__init__()

        # Search attributes
        self.searchResults = None
        self.inputSeg = None
        # Query fields bound to the line edits and combo boxes
        self.titleQuery = ''
        self.authorQuery = ''
        self.langQuery = 'Any'
        self.nbr_results = 200
        # Results box attributes
        self.titleLabels = list()
        self.selectedTitles = list()
        # Selections box attributes
        self.myTitles = list()
        self.mytitleLabels = list()
        # Downloaded texts, as Input objects
        self.createdInputs = list()

        # Language names (as displayed) mapped to the language codes used
        # in the cache. The combo box below lists the keys in this order.
        self.lang_dict = {
            "Any": "", "Afrikaans": "af", "Aleut": "ale", "Arabic": "ar",
            "Arapaho": "arp", "Bodo": "brx", "Breton": "br",
            "Bulgarian": "bg", "Caló": "rmr", "Catalan": "ca",
            "Cebuano": "ceb", "Chinese": "zh", "Czech": "cs",
            "Danish": "da", "Dutch": "nl", "English": "en",
            "Esperanto": "eo", "Estonian": "et", "Farsi": "fa",
            "Finnish": "fi", "French": "fr", "Frisian": "fy",
            "Friulian": "fur", "Gaelic, Scottish": "gla",
            "Galician": "gl", "Gamilaraay": "kld", "German": "de",
            "Greek": "el", "Greek, Ancient": "grc", "Hebrew": "he",
            "Hungarian": "hu", "Icelandic": "is", "Iloko": "ilo",
            "Interlingua": "ia", "Inuktitut": "iu", "Irish": "ga",
            "Italian": "it", "Japanese": "ja", "Kashubian": "csb",
            "Khasi": "kha", "Korean": "ko", "Latin": "la",
            "Lithuanian": "lt", "Maori": "mi", "Mayan Languages": "myn",
            "Middle English": "enm", "Nahuatl": "nah",
            "Napoletano-Calabrese": "nap", "Navajo": "nav",
            "North American Indian": "nai", "Norwegian": "no",
            "Occitan": "oc", "Ojibwa": "oji", "Old English": "ang",
            "Polish": "pl", "Portuguese": "pt", "Romanian": "ro",
            "Russian": "ru", "Sanskrit": "sa", "Serbian": "sr",
            "Slovenian": "sl", "Spanish": "es", "Swedish": "sv",
            "Tagabawa": "bgs", "Tagalog": "tl", "Telugu": "te",
            "Welsh": "cy", "Yiddish": "yi",
        }

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
        )

        #----------------------------------------------------------------------
        # User interface...
        # Create the working area
        self.cacheGenerationButton = gui.button(
            widget=self.controlArea,
            master=self,
            label="Generate cache",
            callback=self.generate_cache,
            tooltip="Generate the gutenberg cache, this might take a while...",
        )

        self.queryBox = gui.widgetBox(
            widget=self.controlArea,
            box="Search books",
            orientation="vertical",
        )

        # Title query, bound to the "titleQuery" attribute
        gui.lineEdit(
            widget=self.queryBox,
            master=self,
            value='titleQuery',
            orientation='horizontal',
            label=u"Title: ",
            labelWidth=120,
            tooltip=("Enter a string"),
        )

        # Author query, bound to the "authorQuery" attribute
        gui.lineEdit(
            widget=self.queryBox,
            master=self,
            value='authorQuery',
            orientation='horizontal',
            label=u"Author: ",
            labelWidth=120,
            tooltip=("Enter a string"),
        )

        # ComboBox for selecting the text language
        gui.comboBox(
            widget=self.queryBox,
            master=self,
            value='langQuery',
            items=list(self.lang_dict),
            sendSelectedValue=True,
            orientation="horizontal",
            label="Language",
            labelWidth=120,
            tooltip=("Please select the desired language.\n"),
        )

        # Maximum number of results, bound to the "nbr_results" attribute
        gui.comboBox(
            widget=self.queryBox,
            master=self,
            value="nbr_results",
            items=[
                "10", "20", "30", "40", "50",
                "60", "70", "80", "90", "100",
                "200", "300", "400", "500", "1000",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Number of results: ",
            labelWidth=120,
            tooltip=("Please select the desired search.\n"),
        )

        # Search button, triggers "search"
        self.searchButton = gui.button(
            widget=self.queryBox,
            master=self,
            label="Search",
            callback=self.search,
            tooltip="Search the Gutenberg cache",
        )

        self.titleListbox = gui.listBox(
            widget=self.queryBox,
            master=self,
            value="selectedTitles",    # setting (list)
            labels="titleLabels",      # setting (list)
            callback=lambda: self.addButton.setDisabled(
                self.selectedTitles == list()),
            tooltip="The list of titles whose content will be imported",
        )
        self.titleListbox.setMinimumHeight(150)
        self.titleListbox.setSelectionMode(3)

        boxbutton = gui.widgetBox(
            widget=self.queryBox,
            box=False,
            orientation='horizontal',
        )

        # Add text button
        self.addButton = gui.button(
            widget=boxbutton,
            master=self,
            label=u'Add to corpus',
            callback=self.add,
            tooltip=(u"Move the selected book downward in your corpus."),
        )
        self.addButton.setDisabled(True)

        # Clear button, triggers "clearResults"
        self.clearButton = gui.button(
            widget=boxbutton,
            master=self,
            label="Clear results",
            callback=self.clearResults,
            tooltip="Clear results",
        )
        self.clearButton.setDisabled(True)
        gui.separator(widget=self.queryBox, height=3)

        # Area where confirmed texts are moved and stored
        mytitleBox = gui.widgetBox(
            widget=self.controlArea,
            box="Corpus",
            orientation="vertical",
        )

        self.mytitleListbox = gui.listBox(
            widget=mytitleBox,
            master=self,
            value="myTitles",
            labels="mytitleLabels",
            callback=lambda: self.removeButton.setDisabled(
                self.myTitles == list()),
            tooltip="The list of books which will be imported",
        )
        self.mytitleListbox.setMinimumHeight(150)
        self.mytitleListbox.setSelectionMode(3)

        boxbutton2 = gui.widgetBox(
            widget=mytitleBox,
            box=False,
            orientation='horizontal',
        )

        # Remove text button
        self.removeButton = gui.button(
            widget=boxbutton2,
            master=self,
            label=u'Remove from corpus',
            callback=self.remove,
            tooltip=(u"Remove the selected book from your corpus."),
        )
        self.removeButton.setDisabled(True)

        # Delete all confirmed texts button
        self.clearmyBasketButton = gui.button(
            widget=boxbutton2,
            master=self,
            label=u'Clear corpus',
            callback=self.clearmyBasket,
            tooltip=(u"Remove all books from your corpus."),
        )
        self.clearmyBasketButton.setDisabled(True)

        gui.separator(widget=mytitleBox, height=3)
        gui.rubber(self.controlArea)

        #----------------------------------------------------------------------
        # Draw Info box and Send button
        self.sendButton.draw()
        self.searchButton.setDefault(True)
        self.infoBox.draw()

        # Update the selections list
        self.updateMytitleLabels()

        # Send data if autoSend.
        self.sendButton.sendIf()

        # Checks if the cache exists
        self.check_cache()

    def check_cache(self):
        """Adapt the layout to the existence of the cache."""
        if not GutenbergCache.exists():
            # No cache: searching is impossible until it is generated.
            self.queryBox.setDisabled(True)
            self.infoBox.setText(
                "Cache must be generated before first launch, it can take up to 10min",
                "warning")
        else:
            # Cache available: no need to generate it again.
            self.cacheGenerationButton.setDisabled(True)

    def generate_cache(self):
        """Generate the Gutenberg cache if it does not exist yet."""
        if GutenbergCache.exists():
            self.infoBox.setText("The cache already exists.")
            return

        try:
            self.infoBox.setText(
                "The cache is being generated. This can take up to 10min.",
                "warning")
            GutenbergCache.create(
                refresh=True,
                download=True,
                unpack=True,
                parse=True,
                deleteTemp=True,
            )
            self.infoBox.setText("Cache generated!")
            self.cacheGenerationButton.setDisabled(True)
            self.queryBox.setEnabled(True)
        except Exception as exc:
            print(exc)
            self.infoBox.setText(
                "An error occurred while building the cache", "error")

    @staticmethod
    def _escapeSqlLiteral(text):
        """Escape text for use inside a single-quoted SQL string literal."""
        return text.replace("'", "''")

    def search(self):
        """Search the Gutenberg cache with the current query fields.

        Results are stored in 'searchResults' as
        [title, author, gutenberg id, language name] lists and displayed in
        the results list.
        """
        query_string = self.titleQuery
        query_author = self.authorQuery
        language = self.lang_dict[self.langQuery]

        # Refuse a query with no criterion at all.
        if (self.langQuery == 'Any' and query_string == ''
                and self.authorQuery == ''):
            self.infoBox.setText(
                "You can't search only by language, if it's set to Any",
                "warning")
            return

        # Recode "first_name name" to "name%, first_name", the form in which
        # authors are matched in the cache.
        author_words = query_author.split()
        if len(author_words) == 2 and "," not in query_author:
            query_author = "%, ".join(author_words[::-1])

        # Filter on the exact language code: a substring match would e.g.
        # return Scottish Gaelic ("gla") books for Latin ("la").
        lang_clause = ""
        if language:
            lang_clause = "AND languages.name = '{}'".format(
                self._escapeSqlLiteral(language))

        # The query is built as text, so user input goes into single-quoted
        # literals with embedded quotes doubled.
        sql_query = """
            /* Creates a new table with one author per book
            by selecting the greatest author id */
            WITH unique_book_author AS
            (SELECT * FROM book_authors
            WHERE authorid IN (SELECT MAX(authorid) FROM book_authors GROUP BY bookid))

            /* Selects title, author, gutenberg id and language */
            SELECT titles.name, authors.name, books.gutenbergbookid, languages.name
            FROM titles

            /* Merges every needed table into one on shared attributes */
            INNER JOIN books ON books.id = titles.bookid
            INNER JOIN unique_book_author ON books.id = unique_book_author.bookid
            INNER JOIN authors ON authors.id = unique_book_author.authorid
            INNER JOIN languages ON books.languageid = languages.id

            /* Matches users query using % wildcard for more permissive query */
            WHERE upper(titles.name) LIKE '%{title}%'
            AND upper(authors.name) LIKE '%{author}%'
            {lang_clause}
            LIMIT {limit}
            """.format(
            title=self._escapeSqlLiteral(query_string),
            author=self._escapeSqlLiteral(query_author),
            lang_clause=lang_clause,
            limit=int(self.nbr_results),
        )

        # Parse query and lookup in the cache.
        cache = GutenbergCache.get_cache()
        try:
            query_results = cache.native_query(sql_query=sql_query)
        except Exception as exc:
            print(exc)
            self.infoBox.setText(
                "An error occurred while interrogating the cache.", "error")
            return

        # Language code -> displayed language name.
        code_to_language = {
            code: name for name, code in self.lang_dict.items()
        }

        self.searchResults = list()
        for row in query_results:
            result = list(row)
            # Replaces all newline types in the title.
            result[0] = re.sub(r'[\n\r]+', r', ', result[0])
            # Recodes author from "name, first_name" to "first_name name".
            result[1] = " ".join(result[1].split(", ")[::-1])
            # Displays the language name; codes that are not in the
            # dictionary are shown as they are.
            result[3] = code_to_language.get(result[3], result[3])
            self.searchResults.append(result)

        # Display info message.
        n_results = len(self.searchResults)
        self.infoBox.setText("{n} result{s} {verb} been found".format(
            n=n_results,
            s="" if n_results == 1 else "s",
            verb="has" if n_results == 1 else "have",
        ))

        self.clearResults()

        # Update the results list with the search results
        # in order to display them.
        for title, author, _gutenberg_id, lang in self.searchResults:
            self.titleLabels.append("{title} — {author} — {lang}".format(
                title=title, author=author, lang=lang))

        self.titleLabels = self.titleLabels
        self.clearButton.setDisabled(False)
        self.addButton.setDisabled(self.selectedTitles == list())
        self.controlArea.setDisabled(False)

    def clearResults(self):
        """Clear the results list"""
        del self.titleLabels[:]
        self.titleLabels = self.titleLabels
        self.clearButton.setDisabled(True)
        self.addButton.setDisabled(self.titleLabels == list())

    def add(self):
        """Add the selected search results to the corpus."""
        for selectedTitle in self.selectedTitles:
            titleData = self.searchResults[selectedTitle]
            if titleData not in self.myBasket:
                self.myBasket.append(titleData)
        self.updateMytitleLabels()
        self.sendButton.settingsChanged()

    def updateMytitleLabels(self):
        """Rebuild the corpus list labels and refresh the corpus buttons."""
        self.mytitleLabels = [
            "{title} — {author} — {lang}".format(
                title=titleData[0], author=titleData[1], lang=titleData[3])
            for titleData in self.myBasket
        ]
        self.clearmyBasketButton.setDisabled(self.myBasket == list())
        self.removeButton.setDisabled(self.myTitles == list())

    def remove(self):
        """Remove the selected books from the corpus."""
        self.myBasket = [
            title for idx, title in enumerate(self.myBasket)
            if idx not in self.myTitles
        ]
        self.updateMytitleLabels()
        self.sendButton.settingsChanged()

    def clearmyBasket(self):
        """Remove all texts from the corpus."""
        self.mytitleLabels = list()
        self.myBasket = list()
        self.sendButton.settingsChanged()
        self.clearmyBasketButton.setDisabled(True)
        # Nothing is left to remove either.
        self.removeButton.setDisabled(True)

    def sendData(self):
        """Download the books of the corpus and send them to output.

        Each book becomes one segment annotated with its title, author and
        language. With an empty corpus, None is sent.
        """
        # Skip if title list is empty:
        if self.myBasket == list():
            self.infoBox.setText(
                "Your corpus is empty, please add some books first",
                "warning")
            self.send("Gutenberg importation", None, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(
            self,
            iterations=len(self.myBasket),
        )

        text_content = list()
        annotations = list()

        try:
            # Retrieve selected texts from gutenberg
            for text in self.myBasket:
                gutenberg_id = text[2]

                # Get the text with Gutenbergpy
                gutenberg_text = gutenbergpy.textget.strip_headers(
                    gutenbergpy.textget.get_text_by_id(gutenberg_id)
                ).decode("utf-8")

                text_content.append(gutenberg_text)
                # populate the annotation list
                annotations.append([text[0], text[1], text[3]])
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        except Exception as exc:
            # Set Info box and widget to "error" state.
            self.infoBox.setText(
                "Couldn't download data from Gutenberg", "error")
            # Do not leave the progress bar hanging.
            progressBar.finish()
            self.controlArea.setDisabled(False)
            print(exc)
            return

        # Store downloaded text strings in input objects...
        for text in text_content:
            self.createdInputs.append(Input(text, self.captionTitle))

        # If there's only one text, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation.
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments with book metadata
        for idx, segment in enumerate(self.segmentation):
            title, author, language = annotations[idx]
            segment.annotations.update({
                "title": title,
                "author": author,
                "language": language,
            })
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Gutenberg importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def clearCreatedInputs(self):
        """Delete all Input objects that have been created."""
        for i in self.createdInputs:
            Segmentation.set_data(i[0].str_index, None)
        del self.createdInputs[:]

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class ExtractCSV(OWTextableBaseWidget):
    """Textable widget that extracts CSV data with the csv module and Sniffer.

    Each data row of the incoming CSV text becomes one output segment: the
    cell of the column chosen as "content" delimits the segment, and every
    other non-empty cell is stored as an annotation keyed by its header.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Extract CSV"
    description = "Extract tabulated data as a Textable Segmentation"
    icon = "icons/extractcsv.png"
    priority = 21   # TODO

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("CSV Data", Segmentation, "inputData")]
    outputs = [("CSV Segmentation", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0]
    )

    autoSend = settings.Setting(False)
    content_column = settings.Setting(0)
    deleteQuotes = settings.Setting(False)

    def __init__(self):
        """Widget creator."""

        super().__init__()

        # Other attributes...
        self.inputSeg = None
        self.outputSeg = None
        self.dialect = None
        self.selectedHeader = None
        # Segments built by treat_input() and sent by sendData().
        self.csvSeg = list()
        # Row numbers of the rows skipped because their content was empty.
        self.contentIsNone = list()
        # Labels displayed in the header list box.
        self.headerList = list()
        self.content_column = 0
        self.headerEdit = ""
        # State of the "rename header" interaction.
        self.renamedHeader = None
        self.isRenamed = False
        self.dict_keys = list()
        # Preprocessing option.
        self.deleteQuotes = False

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )

        #----------------------------------------------------------------------
        # User interface...

        # Preprocess box...
        self.preprocessBox = gui.widgetBox(
            widget=self.controlArea,
            box="Preprocess",
            orientation="vertical",
        )
        self.checkQuotes = gui.checkBox(
            widget=self.preprocessBox,
            master=self,
            value='deleteQuotes',
            label='delete quotation marks',
            callback=self.delete_quotes,
        )

        # Main box...
        self.mainBox = gui.widgetBox(
            widget=self.controlArea,
            box="Click to select a header to modify",
            orientation="vertical",
        )

        # List of all the headers (named with numbers if None)
        self.headerListbox = gui.listBox(
            widget=self.mainBox,
            master=self,
            value="selectedHeader",
            labels="headerList",
            callback=self.update_gui,
            selectionMode=1,  # can only choose one item
            tooltip="list of all your headers",
        )

        # "rename" button
        self.renameHeader = gui.button(
            widget=self.mainBox,
            master=self,
            label="rename",
            callback=self.set_renamebox,
            tooltip="click to rename header"
        )

        # "use as content" button
        self.iscontentHeader = gui.button(
            widget=self.mainBox,
            master=self,
            label="use as content",
            callback=self.content_changed,
            tooltip="click to select as content"
        )

        #----------------------------------------------------------------------
        # Rename box...

        self.renameBox = gui.widgetBox(
            widget=self.controlArea,
            box='Rename header',
            orientation='horizontal',
            addSpace=True,
        )
        gui.separator(widget=self.renameBox, height=3)
        self.headerEditLine = gui.lineEdit(
            widget=self.renameBox,
            master=self,
            value='headerEdit',
            orientation='horizontal',
            label='New header:',
            tooltip=(
                "Rename the selected header."
            ),
            callback=lambda: self.renameButton.setDisabled(not self.headerEdit),
        )
        self.renameButton = gui.button(
            widget=self.renameBox,
            master=self,
            label="rename",
            callback=self.rename,
            tooltip="click to rename header"
        )
        self.cancelButton = gui.button(
            widget=self.renameBox,
            master=self,
            label="cancel",
            callback=self.cancel,
            tooltip="click to cancel renaming"
        )

        #----------------------------------------------------------------------
        # Interface parameters...

        self.update_gui()
        self.renameBox.setVisible(False)

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input", "warning")

        # Send data if autoSend.
        self.sendButton.sendIf()

    #----------------------------------------------------------------------

    def update_gui(self):
        """Enable the header buttons only while a header is selected."""
        # Truthiness also covers the case where no selection list exists yet.
        nothing_selected = not self.selectedHeader
        self.iscontentHeader.setDisabled(nothing_selected)
        self.renameHeader.setDisabled(nothing_selected)

    def content_changed(self):
        """Use the selected header as content column and reprocess input."""
        self.content_column = int(self.selectedHeader[0])
        self.treat_input()

    def delete_quotes(self):
        """Reprocess input after the quotation mark option was toggled."""
        self.treat_input()

    def set_renamebox(self):
        """Show the rename box and lock the rest of the interface."""
        # Remember which header is being renamed.
        self.renamedHeader = int(self.selectedHeader[0])
        # Show the rename controls; "rename" stays off until text is typed.
        self.renameBox.setVisible(True)
        self.renameButton.setDisabled(True)
        # Disable everything else while renaming.
        self.iscontentHeader.setDisabled(True)
        self.renameHeader.setDisabled(True)
        self.headerListbox.setDisabled(True)
        self.checkQuotes.setDisabled(True)

    def rename(self):
        """Apply the new name to the header being renamed and reprocess."""
        # Index directly rather than searching by value, so that renaming
        # works even when several headers share the same name.
        if (
            self.renamedHeader is not None
            and 0 <= self.renamedHeader < len(self.dict_keys)
        ):
            self.dict_keys[self.renamedHeader] = self.headerEdit
            # From now on treat_input() must keep the edited header names.
            self.isRenamed = True
        self.treat_input()
        self._leave_rename_mode()

    def cancel(self):
        """Abort renaming and restore the normal interface."""
        self._leave_rename_mode()

    def _leave_rename_mode(self):
        """Hide the rename box and re-enable what set_renamebox() disabled."""
        self.renameBox.setVisible(False)
        self.headerListbox.setDisabled(False)
        self.checkQuotes.setDisabled(False)
        self.update_gui()
        # Clear the edit line for the next renaming.
        self.headerEdit = ""

    def treat_input(self):
        """Parse the input segmentation as CSV and build output segments.

        Fills ``self.csvSeg`` with one Segment per row whose content cell is
        not empty, records the row numbers of the skipped rows in
        ``self.contentIsNone``, and refreshes the header list of the GUI.
        """
        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            del self.headerList[:]
            self.headerList = self.headerList
            return

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        # Clear results of the previous run.
        del self.csvSeg[:]
        del self.contentIsNone[:]

        # Process each input segment...
        for segment in self.inputSeg:

            # Input segment attributes...
            inputContent = segment.get_content()
            # NOTE(review): offsets below are computed on the text *after*
            # quote removal, while output segments refer to the stored
            # string; with this option on they may be shifted -- confirm.
            if self.deleteQuotes:
                inputContent = inputContent.replace('"', "")
            inputAnnotations = segment.annotations
            inputStrIdx = segment.str_index

            try:
                csv_stream = io.StringIO(inputContent)
                dialect = sniffer.sniff(csv_stream.readline())
                dialect.quoting = csv.QUOTE_NONE
                csv_stream.seek(0)
                my_reader = csv.reader(csv_stream, dialect)
                has_header = sniffer.has_header(inputContent)
                first_row = next(my_reader)
            except (csv.Error, StopIteration):
                # Unparsable or empty input: report it and leave the widget
                # usable instead of stuck in its disabled state.
                del self.csvSeg[:]
                self.infoBox.setText(
                    "Please make sure that input is well-formed CSV data.",
                    "error",
                )
                progressBar.finish()
                self.controlArea.setDisabled(False)
                return

            # Offsets are relative to the stored string, so they start where
            # the input segment itself starts.
            position = segment.start or 0

            if has_header:
                # The first row holds the headers; data start right after it.
                if not self.isRenamed:
                    self.dict_keys = first_row
                # TODO : separator length (if not 1)
                position += sum(len(key) + 1 for key in first_row)
            else:
                # No header: columns are named after their 1-based rank and
                # the first row must be read again as data.
                if not self.isRenamed:
                    self.dict_keys = [
                        str(rank) for rank in range(1, len(first_row) + 1)
                    ]
                csv_stream.seek(0)

            # Refresh the header list of the GUI, flagging the content column.
            del self.headerList[:]
            for column, key in enumerate(self.dict_keys):
                if column == self.content_column:
                    self.headerList.append(str(key) + "(*content)")
                else:
                    self.headerList.append(str(key))
            self.headerList = self.headerList

            # Row numbers start at 2 (the first row is counted as row 1).
            for idx, row in enumerate(my_reader, start=2):
                # Start from the annotations of the input segment...
                segAnnotations = inputAnnotations.copy()
                # ... and add every non-empty cell except the content one.
                for column, (key, value) in enumerate(
                        zip(self.dict_keys, row)):
                    if column != self.content_column and value:
                        segAnnotations[key] = value

                if self.content_column < len(row):
                    content = row[self.content_column]
                else:
                    content = ""

                if content:
                    # Skip the cells (and separators) preceding the content.
                    start = position + sum(
                        len(value) + 1
                        for value in row[:self.content_column]
                    )
                    self.csvSeg.append(
                        Segment(
                            str_index=inputStrIdx,
                            start=start,
                            end=start + len(content),
                            annotations=segAnnotations,
                        )
                    )
                else:
                    # TODO : something with contentIsNone
                    self.contentIsNone.append(idx)

                # Move to the next row: all cells, one separator between
                # cells and one line terminator (an empty row still takes
                # one character).
                position += sum(len(value) for value in row) + max(len(row), 1)

            progressBar.advance()

        # Set status to OK and report segments analyzed...
        unSeg = len(self.csvSeg)
        message = "%i segment@p analyzed." % unSeg
        message = pluralize(message, unSeg)
        message += " (Ignored %i segment@p with no content)" % \
            len(self.contentIsNone)
        message = pluralize(message, len(self.contentIsNone))
        self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()
        self.sendButton.sendIf()

    def inputData(self, newInput):
        """Process incoming data."""
        self.inputSeg = newInput
        self.infoBox.inputChanged()
        # A new input invalidates the headers of the previous one.
        del self.dict_keys[:]
        self.isRenamed = False
        self.sendButton.sendIf()
        self.treat_input()

    def sendData(self):
        """Send the segments computed by treat_input() to output."""

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            del self.headerList[:]
            self.headerList = self.headerList
            self.send("CSV Segmentation", None, self)
            return

        # All the processing has been done in treat_input(); only the
        # output segmentation remains to be built.
        outputSeg = Segmentation(self.csvSeg, label=self.captionTitle)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output." % len(outputSeg)
        message = pluralize(message, len(outputSeg))
        numIgnored = len(self.contentIsNone)
        if numIgnored:
            ignoredMessage = (
                " (ignored %i segment@p with no content)" % numIgnored
            )
            message += pluralize(ignoredMessage, numIgnored)
        self.infoBox.setText(message)

        # Send data to output...
        self.send("CSV Segmentation", outputSeg, self)

        self.sendButton.resetSettingsChangedFlag()

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class Gutenberg(OWTextableBaseWidget):
    """Textable widget for importing clean texts from Gutenberg
    (https://www.gutenberg.org/)

    Books are looked up by title in the local Gutenberg cache, collected in
    a corpus ("basket") and downloaded when data are sent to output.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Gutenberg"
    description = "Gutenberg caching and importation"
    icon = "icons/gutenberg.png"
    priority = 10

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = []
    outputs = [("Gutenberg importation", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    # Saved settings
    autoSend = settings.Setting(False)
    myBasket = settings.Setting([])

    def __init__(self):
        """Widget creator."""

        super().__init__()

        # Search state...
        self.searchResults = None
        self.inputSeg = None
        self.segmentation = None
        # Values bound to the query controls.
        self.titleQuery = ''
        self.authorQuery = ''
        self.nbr_results = 10
        # Results box attributes.
        self.titleLabels = list()
        self.selectedTitles = list()
        # Corpus box attributes.
        self.myTitles = list()
        self.mytitleLabels = list()
        # Input objects created for the downloaded texts.
        self.createdInputs = list()

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
        )

        #----------------------------------------------------------------------
        # User interface...

        # Search box...
        queryBox = gui.widgetBox(
            widget=self.controlArea,
            box="Search books",
            orientation="vertical",
        )

        self.cacheGenerationButton = gui.button(
            widget=queryBox,
            master=self,
            label="Generate cache",
            callback=self.generate_cache,
            tooltip="Generate the gutenberg cache, this might take a while...",
        )

        # Title query.
        gui.lineEdit(
            widget=queryBox,
            master=self,
            value='titleQuery',
            orientation='horizontal',
            label=u"Title: ",
            labelWidth=120,
            tooltip=("Enter a string"),
        )

        # Maximum number of results.
        gui.comboBox(
            widget=queryBox,
            master=self,
            value="nbr_results",
            items=[
                "10",
                "20",
                "30",
                "40",
                "50",
                "60",
                "70",
                "80",
                "90",
                "100",
                "200",
                "300",
                "400",
                "500",
                "1000"
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Number of results: ",
            labelWidth=120,
            tooltip=("Please select the desired search.\n"),
        )

        # Search button.
        self.searchButton = gui.button(
            widget=queryBox,
            master=self,
            label="Search",
            callback=self.search,
            tooltip="Connect Genius and make a research",
        )

        # Search results.
        self.titleListbox = gui.listBox(
            widget=queryBox,
            master=self,
            value="selectedTitles",    # setting (list)
            labels="titleLabels",      # setting (list)
            callback=lambda: self.addButton.setDisabled(
                self.selectedTitles == list()),
            tooltip="The list of titles whose content will be imported",
        )
        self.titleListbox.setMinimumHeight(150)
        self.titleListbox.setSelectionMode(3)

        boxbutton = gui.widgetBox(
            widget=queryBox,
            box=False,
            orientation='horizontal',
        )

        # "Add to corpus" button.
        self.addButton = gui.button(
            widget=boxbutton,
            master=self,
            label=u'Add to corpus',
            callback=self.add,
            tooltip=(u"Move the selected book downward in your corpus."),
        )
        self.addButton.setDisabled(True)

        # "Clear results" button.
        self.clearButton = gui.button(
            widget=boxbutton,
            master=self,
            label="Clear results",
            callback=self.clearResults,
            tooltip="Clear results",
        )
        self.clearButton.setDisabled(True)
        gui.separator(widget=queryBox, height=3)

        # Corpus box: books that will be imported.
        mytitleBox = gui.widgetBox(
            widget=self.controlArea,
            box="Corpus",
            orientation="vertical",
        )

        self.mytitleListbox = gui.listBox(
            widget=mytitleBox,
            master=self,
            value="myTitles",
            labels="mytitleLabels",
            callback=lambda: self.removeButton.setDisabled(
                self.myTitles == list()),
            tooltip="The list of books which will be imported",
        )
        self.mytitleListbox.setMinimumHeight(150)
        self.mytitleListbox.setSelectionMode(3)

        boxbutton2 = gui.widgetBox(
            widget=mytitleBox,
            box=False,
            orientation='horizontal',
        )

        # "Remove from corpus" button.
        self.removeButton = gui.button(
            widget=boxbutton2,
            master=self,
            label=u'Remove from corpus',
            callback=self.remove,
            tooltip=(u"Remove the selected book from your corpus."),
        )
        self.removeButton.setDisabled(True)

        # "Clear corpus" button.
        self.clearmyBasketButton = gui.button(
            widget=boxbutton2,
            master=self,
            label=u'Clear corpus',
            callback=self.clearmyBasket,
            tooltip=(u"Remove all books from your corpus."),
        )
        self.clearmyBasketButton.setDisabled(True)

        gui.separator(widget=mytitleBox, height=3)
        gui.rubber(self.controlArea)

        #----------------------------------------------------------------------
        # Draw Info box and Send button
        self.sendButton.draw()
        self.searchButton.setDefault(True)
        self.infoBox.draw()

        # Display the saved corpus, if any.
        self.updateMytitleLabels()

        # Send data if autoSend.
        self.sendButton.sendIf()

    def generate_cache(self):
        """Download and build the local Gutenberg cache."""
        GutenbergCache.create(
            refresh=True,
            download=True,
            unpack=True,
            parse=True,
            deleteTemp=True,
        )

    def search(self):
        """Look up the title query in the Gutenberg cache and list results."""
        query_string = self.titleQuery

        if not query_string:
            self.infoBox.setText("You didn't search anything", "warning")
            return

        # The query is embedded in the SQL text, so single quotes typed by
        # the user are doubled to keep them inside the string literal.
        safe_query = query_string.replace("'", "''")
        cache = GutenbergCache.get_cache()
        query_results = cache.native_query(
            sql_query=
            "select * from titles where upper(name) like upper('%{query}%') limit {limit}"
            .format(query=safe_query, limit=int(self.nbr_results)))
        self.searchResults = list(query_results)

        # Display info message.
        n_results = len(self.searchResults)
        self.infoBox.setText("{n} result{s} have been found".format(
            n=n_results, s="s" if n_results > 0 else ""))

        # Replace (rather than extend) the displayed results: the indices
        # selected in the list box are used by add() to index
        # self.searchResults, so both lists must stay aligned.
        del self.titleLabels[:]
        for result in self.searchResults:
            self.titleLabels.append(str(result[1]))
        self.titleLabels = self.titleLabels

        self.clearButton.setDisabled(False)
        self.addButton.setDisabled(self.selectedTitles == list())
        self.controlArea.setDisabled(False)

    def clearResults(self):
        """Clear the results list."""
        del self.titleLabels[:]
        self.titleLabels = self.titleLabels
        self.clearButton.setDisabled(True)
        self.addButton.setDisabled(self.titleLabels == list())

    def add(self):
        """Add the selected search results to the corpus."""
        for selectedTitle in self.selectedTitles:
            titleData = self.searchResults[selectedTitle]
            # Avoid duplicates in the corpus.
            if titleData not in self.myBasket:
                self.myBasket.append(titleData)
        self.updateMytitleLabels()
        self.sendButton.settingsChanged()

    def updateMytitleLabels(self):
        """Refresh the corpus list box and the state of its buttons."""
        self.mytitleLabels = [titleData[1] for titleData in self.myBasket]
        self.clearmyBasketButton.setDisabled(self.myBasket == list())
        self.removeButton.setDisabled(self.myTitles == list())

    def remove(self):
        """Remove the selected books from the corpus."""
        self.myBasket = [
            title for idx, title in enumerate(self.myBasket)
            if idx not in self.myTitles
        ]
        self.updateMytitleLabels()
        self.sendButton.settingsChanged()

    def clearmyBasket(self):
        """Remove all books from the corpus."""
        self.myBasket = list()
        # Also refreshes the labels and both corpus buttons.
        self.updateMytitleLabels()
        self.sendButton.settingsChanged()

    def sendData(self):
        """Download the books of the corpus and send them to output."""

        # Skip if the corpus is empty.
        if self.myBasket == list():
            self.infoBox.setText(
                "Your corpus is empty, please add some books first",
                "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(
            self,
            iterations=len(self.myBasket),
        )

        text_content = list()
        annotations = list()

        try:
            # Get the Gutenberg cache.
            cache = GutenbergCache.get_cache()
            for text in self.myBasket:
                # Get the Gutenberg id of the book.
                query_id = cache.native_query(
                    sql_query=
                    "select gutenbergbookid from books where id == {selected_id}"
                    .format(selected_id=int(text[2])))
                gutenberg_id = list(query_id)

                # Get the text with Gutenbergpy.
                gutenberg_text = gutenbergpy.textget.strip_headers(
                    gutenbergpy.textget.get_text_by_id(gutenberg_id[0][0]))
                # Input objects are built from text, not from raw bytes.
                if isinstance(gutenberg_text, bytes):
                    gutenberg_text = gutenberg_text.decode("utf-8")

                text_content.append(gutenberg_text)
                annotations.append(text[1])
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from Gutenberg",
                                 "error")
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store downloaded texts in input objects...
        for text in text_content:
            newInput = Input(text, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one text, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # TODO: annotate with more book metadata
        # Annotate segments with the book title...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update({"title": annotations[idx]})
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Gutenberg importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def clearCreatedInputs(self):
        """Delete all Input objects that have been created."""
        for i in self.createdInputs:
            Segmentation.set_data(i[0].str_index, None)
        del self.createdInputs[:]

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class Treetagger(OWTextableBaseWidget):
    """Orange widget for POS-tagging and lemmatization with Treetagger"""

    name = "Treetagger"
    description = "POS-tagging and lemmatization with Treetagger"
    icon = "icons/treetagger.svg"
    priority = 2003

    inputs = [("Segmentation", Segmentation, "inputData")]
    outputs = [("Tagged data", Segmentation)]

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])
    language = settings.Setting(0)
    replaceUnknown = settings.Setting(False)
    outputFormat = settings.Setting("segment into words")

    want_main_area = False

    # File in which the Treetagger base directory is remembered.
    configFilePath = os.path.normpath(
        appdirs.user_data_dir("textable", "langtech") + "/treetagger_path")

    def __init__(self, *args, **kwargs):
        """Initialize a Treetagger widget"""
        super().__init__(*args, **kwargs)

        # Other attributes...
        self.segmentation = None
        self.createdInputs = list()
        self.noLanguageParameterWarning = (
            "Please make sure that at least one language parameter "
            "file is installed in your Treetagger 'lib' directory, "
            "then click 'Reload language parameter files'.")
        self.noTreetaggerPathWarning = (
            "Please click 'Locate Treetagger' below and select the "
            "base directory of a valid Treetagger distribution.")
        # Prefer an automatically located distribution, else the saved path.
        self.TreetaggerPath = (treetaggerwrapper.locate_treetagger()
                               or self.lookupSavedTreetaggerPath())

        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(widget=self.controlArea,
                                     master=self,
                                     callback=self.sendData,
                                     infoBoxAttribute=u"infoBox",
                                     sendIfPreCallback=self.updateGUI)

        gui.separator(self.controlArea, height=3)

        self.optionsBox = gui.widgetBox(
            self.controlArea,
            u"Options",
        )

        self.languageCombobox = gui.comboBox(
            widget=self.optionsBox,
            master=self,
            value="language",
            items=list(),
            sendSelectedValue=True,
            orientation=u"horizontal",
            label="Input language:",
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Select the language of the input text."),
        )
        self.languageCombobox.setMinimumWidth(120)

        gui.separator(self.optionsBox, height=3)

        gui.comboBox(
            widget=self.optionsBox,
            master=self,
            value="outputFormat",
            items=[
                "segment into words",
                "add XML tags",
            ],
            sendSelectedValue=True,
            orientation=u"horizontal",
            label="Output format:",
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"Select the format of the output:\n\n"
                u"Segment into words: each word is in a separate segment,\n"
                u"with lemma and POS-tag as annotations.\n\n"
                u"Add XML tags: output segments correspond to input segments\n"
                u"and each word is tagged in XML as a 'w' element with\n"
                u"lemma and POS-tag as attributes."),
        )

        gui.separator(self.optionsBox, height=3)

        gui.checkBox(
            widget=self.optionsBox,
            master=self,
            value="replaceUnknown",
            label="Output token in place of [unknown] lemmas",
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"For out-of-vocabulary words, the word form is used as the\n"
                u"lemma (in place of Treetagger's default 'unknown' code)."),
        )

        gui.rubber(self.controlArea)

        self.sendButton.draw()
        self.infoBox.draw()

        self.locateTreetaggerBox = gui.widgetBox(
            self.controlArea,
            addSpace=False,
        )

        gui.separator(self.locateTreetaggerBox, height=3)

        self.treetaggerButton = gui.button(
            widget=self.locateTreetaggerBox,
            master=self,
            label="Locate Treetagger",
            callback=self.validateTreetagger,
            tooltip=(
                u"Click to select the location of the Treetagger base\n"
                u"directory (containing the 'lib' and 'bin' subdirectories)."),
        )

        self.sendButton.sendIf()

        self.adjustSizeWithTimer()

    def inputData(self, inputData):
        """Process incoming data."""
        self.segmentation = inputData
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def sendData(self):
        """Tag the input with Treetagger and send the result to output."""

        # Clear created Inputs...
        self.clearCreatedInputs()

        if not self.TreetaggerPath:
            self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.getAvailableLanguages():
            self.infoBox.setText(self.noLanguageParameterWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.segmentation:
            self.infoBox.setText(u"Widget needs input", "warning")
            self.send("Tagged data", None)
            return

        # Initialize progress bar.
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        self.progressBar = ProgressBar(self, iterations=5)

        # Create a copy of input seg, storing annotations in temp attr...
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.segmentation.label
        for seg_idx, segment in enumerate(self.segmentation):
            # Serialize annotations as XML attributes; combining marks are
            # stripped from the annotation keys.
            attr = " ".join([
                "%s=%s" % (
                    ''.join(c for c in unicodedata.normalize('NFD', item[0])
                            if unicodedata.category(c) != 'Mn'),
                    quoteattr(str(item[1])),
                ) for item in segment.annotations.items()
            ])
            segment.annotations["tt_ax"] = attr
            copy_of_input_seg.append(segment)

        self.progressBar.advance()

        # Dump segmentation in unique string to avoid multiple calls to TT...
        concatenated_text = copy_of_input_seg.to_string(
            formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
            display_all=True,
        )

        self.progressBar.advance()

        # Tag the segmentation contents...
        tagopt = '-token -lemma -sgml -quiet'
        if self.replaceUnknown:
            tagopt += " -no-unknown"
        tagger = treetaggerwrapper.TreeTagger(
            TAGLANG=pycountry.languages.get(name=self.language).alpha_2,
            TAGOPT=tagopt,
            TAGDIR=self.TreetaggerPath,
        )
        tagged_lines = tagger.tag_text(
            concatenated_text,
            notagurl=True,
            notagemail=True,
            notagip=True,
            notagdns=True,
        )
        tagged_input = Input("\n".join(tagged_lines))
        self.createdInputs.append(tagged_input)

        # Replace <unknown> with [unknown], then re-segment to match the
        # original segmentation structure.
        # NOTE(review): as written, the second substitution replaces '"""'
        # with itself (a no-op); the replacement was presumably meant to be
        # an escaped form of the quotation mark -- confirm against upstream.
        tagged_segmentation, _ = Segmenter.recode(
            tagged_input,
            substitutions=[
                (re.compile(r"<unknown>"), "[unknown]"),
                (re.compile(r'"""'), '"""'),
            ],
        )
        tagged_segmentation = Segmenter.import_xml(tagged_segmentation,
                                                   "ax_tt")

        self.progressBar.advance()

        # Place each output line of Treetagger in an xml tag with annotations..
        xml_segmentation, _ = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                 '<w lemma="&3" pos-tag="&2">&1</w>'),
                (re.compile(r'^\n|\n$'), ''),
            ],
        )

        # Segment into individual tokens if XML output option is disabled...
        if self.outputFormat == "add XML tags":
            output_segmentation = xml_segmentation
        else:
            try:
                output_segmentation = Segmenter.import_xml(
                    xml_segmentation, "w")
            except ValueError:
                self.infoBox.setText(
                    "Please check that either the input contains well-formed "
                    "XML, or it doesn't contain instances of '<' and '\x3e'",
                    "error")
                self.send("Tagged data", None)
                self.progressBar.finish()
                self.controlArea.setDisabled(False)
                return

        self.progressBar.finish()
        self.controlArea.setDisabled(False)

        output_segmentation.label = self.captionTitle
        message = u'%i segment@p sent to output.' % len(output_segmentation)
        message = pluralize(message, len(output_segmentation))
        self.infoBox.setText(message)
        self.send('Tagged data', output_segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def updateGUI(self):
        """Update GUI state"""
        if self.TreetaggerPath:
            self.optionsBox.setDisabled(False)
            self.locateTreetaggerBox.setVisible(False)
            self.languageCombobox.clear()
            languages = self.getAvailableLanguages()
            if not languages:
                self.infoBox.setText(self.noLanguageParameterWarning,
                                     "warning")
                self.optionsBox.setDisabled(True)
                self.locateTreetaggerBox.setVisible(True)
                self.treetaggerButton.setText(
                    "Reload language parameter files")
            else:
                self.language = self.language or languages[0]
        else:
            self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
            self.optionsBox.setDisabled(True)
            self.locateTreetaggerBox.setVisible(True)
        self.adjustSizeWithTimer()

    def getAvailableLanguages(self):
        """Return the names of the languages usable with this Treetagger.

        A language is usable when a tagger can be instantiated for it. Each
        usable language is also made available in the language combo box.
        """
        languages = list()
        for lang_code in sorted(treetaggerwrapper.g_langsupport):
            if lang_code.startswith("__"):
                continue
            try:
                treetaggerwrapper.TreeTagger(
                    TAGLANG=lang_code,
                    TAGDIR=self.TreetaggerPath,
                )
                language = pycountry.languages.get(alpha_2=lang_code).name
            except Exception:
                # Deliberate best effort: a language that cannot be loaded
                # or named is simply not offered.
                continue
            # This method is also called when the combo box has not been
            # cleared first (e.g. from sendData), so avoid duplicate items.
            if self.languageCombobox.findText(language) == -1:
                self.languageCombobox.addItem(language)
            languages.append(language)
        return languages

    def lookupSavedTreetaggerPath(self):
        """Look for a saved Treetagger base dir path in app data"""
        configFilePath = self.__class__.configFilePath
        if not os.path.exists(configFilePath):
            return None
        try:
            with open(configFilePath, "r") as inputFile:
                TreetaggerSavedPath = inputFile.read()
            if self.checkTreetaggerPath(TreetaggerSavedPath):
                return TreetaggerSavedPath
            # The saved path is no longer valid: forget it.
            os.remove(configFilePath)
        except IOError:
            pass
        return None

    def validateTreetagger(self):
        """Respond to user actions needed to validate Treetagger path"""

        # If the Treetagger path is known, make sure there are language files...
        if self.TreetaggerPath:
            if self.getAvailableLanguages():
                self.sendButton.settingsChanged()
                self.updateGUI()
            else:
                QMessageBox.warning(None, 'Textable',
                                    'Language parameter files not found.',
                                    QMessageBox.Ok)
            return

        # Else if the path is not known...

        # First try to locate it automatically...
        TreetaggerPath = treetaggerwrapper.locate_treetagger()

        # If it fails, let the user locate it manually...
        if not (TreetaggerPath and self.checkTreetaggerPath(TreetaggerPath)):

            # Never keep (and save) an automatically located path that did
            # not pass validation.
            TreetaggerPath = None

            selectedDir = str(
                QFileDialog.getExistingDirectory(
                    self, u"Please locate Treetagger base directory"))

            # If user selected a dir... (test the raw answer: normalizing
            # the empty string of a cancelled dialog would yield ".")
            if selectedDir:
                TreetaggerManualPath = os.path.normpath(selectedDir)

                # Check if selected dir contains Treetagger binary...
                if self.checkTreetaggerPath(TreetaggerManualPath):
                    TreetaggerPath = TreetaggerManualPath
                else:
                    QMessageBox.warning(
                        None, 'Textable',
                        'Not a valid Treetagger base directory.',
                        QMessageBox.Ok)

        # If a valid path was found somehow, save config to app data...
        if TreetaggerPath:
            configFilePath = self.__class__.configFilePath
            try:
                # Creates the missing parent directories as well.
                os.makedirs(os.path.dirname(configFilePath), exist_ok=True)
                with open(configFilePath, "w") as outputFile:
                    outputFile.write(TreetaggerPath)
            except IOError:
                # Saving is a convenience; the path is still used below.
                pass
            self.TreetaggerPath = TreetaggerPath

            self.sendButton.settingsChanged()

    def checkTreetaggerPath(self, path):
        """Check if path is a valid Treetagger base dir"""
        return os.path.exists(
            os.path.normpath(path + "/bin/tree-tagger" +
                             (".exe" if os.name == "nt" else "")))

    def clearCreatedInputs(self):
        """Delete all Input objects that have been created."""
        for i in self.createdInputs:
            Segmentation.set_data(i[0].str_index, None)
        del self.createdInputs[:]

    def onDeleteWidget(self):
        """Free memory when widget is deleted (overriden method)"""
        self.clearCreatedInputs()

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class TopicModels(OWTextableBaseWidget):
    """Textable widget for building topic models based on a term-document
    matrix.

    The widget takes a PivotCrosstab on input (rows are read as documents
    and columns as terms), applies the topic modelling method selected by
    the user and sends a term-topic table and a document-topic table, each
    both as a Textable PivotCrosstab and as an Orange table.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Topic Models"
    description = "Build topic models based on term-document matrices"
    icon = "icons/topic_models.svg"
    priority = 10

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Textable Crosstab", PivotCrosstab, "input_data")]
    outputs = [
        ("Term-topic Textable table", PivotCrosstab, widget.Default),
        ("Document-topic Textable table", PivotCrosstab),
        ("Term-topic Orange table", Orange.data.Table, widget.Default),
        ("Document-topic Orange table", Orange.data.Table),
    ]

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    autoSend = settings.Setting(False)
    method = settings.Setting("Latent semantic indexing")
    numTopics = settings.Setting(10)

    want_main_area = False

    def __init__(self):
        """Widget creator."""

        super().__init__()

        # Other attributes...
        self.inputTable = None
        self.listEntries = list()

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.send_data,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=self.updateGUI,
        )

        # User interface...

        # Options box (method, number of topics and topic overview)...
        optionsBox = gui.widgetBox(
            widget=self.controlArea,
            box="Options",
            orientation="vertical",
        )

        method_combo = gui.comboBox(
            widget=optionsBox,
            master=self,
            value="method",
            items=[
                "Latent Dirichlet allocation",
                "Latent semantic indexing",
                "Correspondence analysis",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Method:",
            labelWidth=120,
            callback=self.sendButton.settingsChanged,
            tooltip=("Please select the desired topic modelling method.\n"),
        )
        method_combo.setMinimumWidth(120)
        gui.separator(widget=optionsBox, height=3)

        self.numTopicsSpin = gui.spin(
            widget=optionsBox,
            master=self,
            value='numTopics',
            minv=1,
            maxv=999,
            orientation='horizontal',
            label=u'Number of topics:',
            labelWidth=120,
            callback=self.sendButton.settingsChanged,
            keyboardTracking=False,
            tooltip=(
                u"Please select the desired number of topics in output tables."
            ),
        )
        gui.separator(widget=optionsBox, height=3)

        # Read-only overview of the topics (filled by send_data)...
        gui.listBox(
            widget=optionsBox,
            master=self,
            labels='listEntries',
            tooltip=(u"TODO"),
        )

        gui.separator(widget=self.controlArea, height=3)

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input", "warning")

        # Send data if autoSend.
        self.sendButton.sendIf()

        self.setMinimumWidth(350)
        self.adjustSizeWithTimer()

    def input_data(self, newInput):
        """Process incoming data."""
        self.inputTable = newInput
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def send_data(self):
        """Compute result of widget processing and send to output.

        Without input (or with an unrecognised method setting) None is sent
        on every output channel and the topic list is emptied. Otherwise the
        builder matching self.method computes the term-topic table, the
        document-topic table and the topic list entries, which are then
        displayed and sent.
        """

        # Check that there's a table in input...
        if self.inputTable is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self._send_tables(None, None)
            self.listEntries = list()
            return

        # Select the builder for the requested method...
        builders = {
            "Latent Dirichlet allocation": self._build_lda_tables,
            "Latent semantic indexing": self._build_lsi_tables,
            "Correspondence analysis": self._build_ca_tables,
        }
        builder = builders.get(self.method)
        if builder is None:
            # E.g. a stale stored setting: report it rather than failing
            # later on unbound table variables.
            self.infoBox.setText(
                "Unknown topic modelling method: %s" % self.method,
                "error",
            )
            self._send_tables(None, None)
            self.listEntries = list()
            return

        # Initialize progress bar.
        progressBar = gui.ProgressBar(
            self,
            iterations=1    # TODO
        )

        # Apply topic modelling; the progress bar is cleared exactly once,
        # even if the computation raises.
        try:
            segmentTopicTable, contextTopicTable, newListEntries = builder()
        finally:
            progressBar.finish()

        # Fill listbox...
        self.listEntries = newListEntries

        # Set status to OK and report...
        self.infoBox.setText("Tables correctly sent to output.")

        # Send tokens...
        self._send_tables(segmentTopicTable, contextTopicTable)

        self.sendButton.resetSettingsChangedFlag()

    def _send_tables(self, segmentTopicTable, contextTopicTable):
        """Send both tables in Textable and Orange format (or None)."""
        self.send("Term-topic Textable table", segmentTopicTable)
        self.send("Document-topic Textable table", contextTopicTable)
        self.send(
            "Term-topic Orange table",
            None if segmentTopicTable is None
            else segmentTopicTable.to_orange_table(),
        )
        self.send(
            "Document-topic Orange table",
            None if contextTopicTable is None
            else contextTopicTable.to_orange_table(),
        )

    @staticmethod
    def _format_axis_entry(topicNum, propInertia, scores, colIds):
        """Return the listbox entry of a factorial axis (LSI or CA).

        Terms are ordered by decreasing score; when there are more than
        MAX_NUM_DISPLAYED_TERMS of them, only both ends of the ranking are
        displayed. propInertia is a proportion (displayed as a percentage).
        """
        sortedTerms = colIds[scores.argsort()[::-1]]
        if len(colIds) > MAX_NUM_DISPLAYED_TERMS:
            displayedTerms = ", ".join(
                sortedTerms[:MAX_NUM_DISPLAYED_TERMS // 2])
            displayedTerms += ", ..., "
            displayedTerms += ", ".join(
                sortedTerms[-MAX_NUM_DISPLAYED_TERMS // 2:])
        else:
            displayedTerms = ", ".join(sortedTerms)
        return "%i. (%.2f%%) %s" % (
            topicNum + 1,
            propInertia * 100,
            displayedTerms,
        )

    def _build_lda_tables(self):
        """Return (term-topic table, document-topic table, list entries)
        computed with latent Dirichlet allocation.
        """
        # Convert input table to gensim dictionary.
        dictionary, corpus = pivot_crosstab_to_gensim(self.inputTable)

        model = models.LdaModel(
            corpus,
            id2word=dictionary,
            num_topics=self.numTopics,
        )
        numTerms = len(self.inputTable.col_ids)

        # Create segment-topic PivotCrosstab table.
        values = dict()
        terms = list()
        for topic in range(self.numTopics):
            topic_terms = model.get_topic_terms(topic, numTerms)
            for term, score in topic_terms:
                values[(dictionary[term], topic)] = score
            terms.append([
                dictionary[term]
                for term, _ in topic_terms[:MAX_NUM_DISPLAYED_TERMS]
            ])
        segmentTopicTable = PivotCrosstab(
            row_ids=self.inputTable.col_ids[:],
            col_ids=list(range(self.numTopics)),
            values=values,
            header_row_id='__topic__',
            header_row_type='continuous',
            header_col_id='__unit__',
            header_col_type='string',
            col_type={
                col_id: 'continuous' for col_id in range(self.numTopics)
            },
        )

        # Prepare listbox entries (top terms of each topic)...
        suffix = ", ..." if numTerms > MAX_NUM_DISPLAYED_TERMS else ""
        newListEntries = [
            "%i. %s" % (topicNum + 1, ", ".join(topicTerms) + suffix)
            for topicNum, topicTerms in enumerate(terms)
        ]

        # Create context-topic PivotCrosstab table...
        values = dict()
        for row, lda_doc in zip(self.inputTable.row_ids, model[corpus]):
            for topic, score in lda_doc:
                values[(row, topic)] = score
        contextTopicTable = PivotCrosstab(
            row_ids=self.inputTable.row_ids[:],
            col_ids=list(range(self.numTopics)),
            values=values,
            header_row_id='__topic__',
            header_row_type='continuous',
            header_col_id='__context__',
            header_col_type='string',
            col_type={
                col_id: 'continuous' for col_id in range(self.numTopics)
            },
            missing=0,
        )
        return segmentTopicTable, contextTopicTable, newListEntries

    def _build_lsi_tables(self):
        """Return (term-topic table, document-topic table, list entries)
        computed with latent semantic indexing.
        """
        # Convert input table to gensim dictionary.
        dictionary, corpus = pivot_crosstab_to_gensim(self.inputTable)

        model = models.LsiModel(
            corpus,
            id2word=dictionary,
            num_topics=self.numTopics,
        )

        # Create segment-topic PivotCrosstab table.
        segmentTopicTable = PivotCrosstab.from_numpy(
            row_ids=self.inputTable.col_ids[:],
            col_ids=list(range(self.numTopics)),
            np_array=model.projection.u,
            header_row_id='__topic__',
            header_row_type='continuous',
            header_col_id='__unit__',
            header_col_type='string',
            col_type={
                col_id: 'continuous' for col_id in range(self.numTopics)
            },
        )

        # Subtask: compute total inertia, i.e. sum of eigenvalues of
        # doc-term matrix multiplied by its transposed. That sum is the
        # trace of the product, hence the sum of the squared cells of the
        # doc-term matrix itself: no eigen-decomposition is needed (and the
        # result is guaranteed to be real).
        rect_matrix = self.inputTable.to_numpy()
        total_inertia = np.square(rect_matrix).sum()

        # Prepare listbox entries...
        colIds = np.array(self.inputTable.col_ids)
        newListEntries = list()
        for topicNum in range(self.numTopics):
            # Proportion of inertia is SQUARE of singular value divided by
            # total inertia, because n-th singular value = square root of
            # n-th eigenvalue (cf. compute total inertia above)...
            propInertia = model.projection.s[topicNum]**2 / total_inertia
            newListEntries.append(
                self._format_axis_entry(
                    topicNum,
                    propInertia,
                    model.projection.u[:, topicNum],
                    colIds,
                )
            )

        # Create context-topic PivotCrosstab table...
        contextTopicMatrix = corpus2dense(
            model[corpus], len(model.projection.s)).T / model.projection.s
        values = dict()
        for row_idx, row in enumerate(contextTopicMatrix):
            for topic, val in enumerate(row):
                values[(self.inputTable.row_ids[row_idx], topic)] = val
        contextTopicTable = PivotCrosstab(
            row_ids=self.inputTable.row_ids[:],
            col_ids=list(range(self.numTopics)),
            values=values,
            header_row_id='__topic__',
            header_row_type='continuous',
            header_col_id='__context__',
            header_col_type='string',
            col_type={
                col_id: 'continuous' for col_id in range(self.numTopics)
            },
            missing=0,
        )
        return segmentTopicTable, contextTopicTable, newListEntries

    def _build_ca_tables(self):
        """Return (term-topic table, document-topic table, list entries)
        computed with correspondence analysis.
        """
        ca = correspondence(self.inputTable.to_numpy())

        # Create segment-topic PivotCrosstab table.
        segmentTopicTable = PivotCrosstab.from_numpy(
            row_ids=self.inputTable.col_ids[:],
            col_ids=list(range(self.numTopics)),
            np_array=ca.col_factors[:, range(self.numTopics)],
            header_row_id='__topic__',
            header_row_type='continuous',
            header_col_id='__unit__',
            header_col_type='string',
            col_type={
                col_id: 'continuous' for col_id in range(self.numTopics)
            },
        )

        # Prepare listbox entries (axis inertias are computed only once)...
        colIds = np.array(self.inputTable.col_ids)
        axisInertia = ca.inertia_of_axis()
        total_inertia = sum(axisInertia)
        newListEntries = [
            self._format_axis_entry(
                topicNum,
                axisInertia[topicNum] / total_inertia,
                np.array(ca.col_factors[:, topicNum]),
                colIds,
            )
            for topicNum in range(self.numTopics)
        ]

        # Create context-topic PivotCrosstab table. Rows are the input
        # table's rows, so the row header is labelled '__context__' as in
        # the tables built by the two other methods.
        contextTopicTable = PivotCrosstab.from_numpy(
            row_ids=self.inputTable.row_ids[:],
            col_ids=list(range(self.numTopics)),
            np_array=ca.row_factors[:, range(self.numTopics)],
            header_row_id='__topic__',
            header_row_type='continuous',
            header_col_id='__context__',
            header_col_type='string',
            col_type={
                col_id: 'continuous' for col_id in range(self.numTopics)
            },
        )
        return segmentTopicTable, contextTopicTable, newListEntries

    def updateGUI(self):
        """Update GUI state.

        For LSI and correspondence analysis the number of topics is capped
        at the smallest dimension of the input table minus one (never below
        1); in every other case the spin box accepts 1 to 999.
        """
        maxNumTopics = 999
        if (
            self.inputTable is not None
            and self.method in (
                "Latent semantic indexing",
                "Correspondence analysis",
            )
        ):
            smallestDim = min(
                len(self.inputTable.row_ids),
                len(self.inputTable.col_ids),
            )
            maxNumTopics = max(1, smallestDim - 1)
        self.numTopicsSpin.setRange(1, maxNumTopics)
class OWTextableTextTree(OWTextableBaseWidget): """Orange widget for loading text folders""" name = "Text Tree" description = "Import data from raw text trees" icon = "icons/Textfolders.png" icon = "icons/textTree.svg" priority = 2 # Input and output channels... inputs = [('Message', JSONMessage, "inputMessage", widget.Single)] outputs = [('Text data', Segmentation)] settingsHandler = VersionedSettingsHandler( version=__version__.rsplit(".", 1)[0]) # Settings... autoSend = settings.Setting(True) folders = settings.Setting([]) encoding = settings.Setting('iso-8859-1') operation = settings.Setting('nothing') sampling = settings.Setting(100) autoNumber = settings.Setting(False) autoNumberKey = settings.Setting(u'num') importFilenames = settings.Setting(True) importFolderName = settings.Setting(True) lastLocation = settings.Setting('.') displayAdvancedSettings = settings.Setting(False) folder = settings.Setting(u'') want_main_area = False def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Other attributes... self.segmentation = None self.operation = "no" self.applyInclusion = False self.applyExclusion = False self.applySampling = True self.samplingRate = 100 self.createdInputs = list() self.folderLabels = list() self.selectedFolderLabels = list() self.rootFolderPath = u'' self.inclusionsUser = u'' self.exclusionsUser = u'' self.newAnnotationKey = u'' self.newAnnotationValue = u'' # self.folder is a dictionary whose keys are :'rootPath', 'maxDepth','inclusionsUser','exclusionsUser', ... # ... 'samplingRate' and 'fileList' self.folder = dict() # self.folders is a list of previously defined "self.folder" dictionaries self.folders = list() # self.inclusionList is the default inclusion list (used in minimal mode, ... # ... and in advanced mode when no inclusion has been selected) self.inclusionList = [".txt", ".html", ".xml", ".csv", ".rtf"] # self.inclusionList is the default null inclusion list (used in minimal mode, ... # ... 
and in advanced mode when no inclusion has been selected) self.exclusionList = [] self.infoBox = InfoBox(widget=self.controlArea) self.sendButton = SendButton( widget=self.controlArea, master=self, callback=self.sendData, infoBoxAttribute='infoBox', sendIfPreCallback=self.updateGUI, ) self.advancedSettings = AdvancedSettings( widget=self.controlArea, master=self, callback=self.sendButton.settingsChanged, ) # GUI... # Advanced settings checkbox... self.advancedSettings.draw() # BASIC GUI... # Basic folder box basicFolderBox = gui.widgetBox( widget=self.controlArea, box=u'Source', orientation='vertical', addSpace=False, ) basicFolderBoxLine1 = gui.widgetBox( widget=basicFolderBox, box=False, orientation='horizontal', ) gui.lineEdit( widget=basicFolderBoxLine1, master=self, value='rootFolderPath', orientation='horizontal', label=u'Folder path:', labelWidth=101, callback=self.add, tooltip=(u"The path of the folder."), ) gui.separator(widget=basicFolderBoxLine1, width=5) gui.button( widget=basicFolderBoxLine1, master=self, label=u'Browse', callback=self.browse, tooltip=(u"Open a dialog for selecting a top folder."), ) gui.separator(widget=basicFolderBox, width=3) self.advancedSettings.basicWidgets.append(basicFolderBox) self.advancedSettings.basicWidgetsAppendSeparator() # ADVANCED GUI... 
# folder box folderBox = gui.widgetBox( widget=self.controlArea, box=u'Sources', orientation='vertical', addSpace=False, ) folderBoxLine1 = gui.widgetBox( widget=folderBox, box=False, orientation='horizontal', addSpace=True, ) self.folderListbox = gui.listBox( widget=folderBoxLine1, master=self, value='selectedFolderLabels', labels='folderLabels', callback=self.updatefolderBoxButtons, tooltip=(u"The list of folders whose content will be imported.\n" u"\nIn the output segmentation, the content of each\n" u"folder appears in the same position as in the list.\n" u"\nColumn 1 shows the folder's name.\n" u"Column 2 shows the folder's depth.\n" u"Column 3 shows the inclusions filter.\n" u"Column 4 shows the exclusions filter.\n" u"Column 5 shows the folder's level of sampling."), ) font = QFont() font.setFamily('Courier') font.setStyleHint(QFont.Courier) font.setPixelSize(12) self.folderListbox.setFont(font) folderBoxCol2 = gui.widgetBox( widget=folderBoxLine1, orientation='vertical', ) self.moveUpButton = gui.button( widget=folderBoxCol2, master=self, label=u'Move Up', callback=self.moveUp, tooltip=(u"Move the selected folder upward in the list."), ) self.moveDownButton = gui.button( widget=folderBoxCol2, master=self, label=u'Move Down', callback=self.moveDown, tooltip=(u"Move the selected folder downward in the list."), ) self.removeButton = gui.button( widget=folderBoxCol2, master=self, label=u'Remove', callback=self.remove, tooltip=(u"Remove the selected folder from the list."), ) self.clearAllButton = gui.button( widget=folderBoxCol2, master=self, label=u'Clear All', callback=self.clearAll, tooltip=(u"Remove all folders from the list."), ) self.exportButton = gui.button( widget=folderBoxCol2, master=self, label=u'', callback=self.exportList, disabled=True, tooltip=(u"Open a dialog for selecting a folder where the folder\n" u"list can be exported in JSON format."), ) self.importButton = gui.button( widget=folderBoxCol2, master=self, label=u'', 
callback=self.importList, disabled=True, tooltip=(u"Open a dialog for selecting a folder list to\n" u"import (in JSON format). folders from this list\n" u"will be added to those already imported."), ) folderBoxLine2 = gui.widgetBox( widget=folderBox, box=False, orientation='vertical', ) # Add folder box addFolderBox = gui.widgetBox( widget=folderBoxLine2, box=True, orientation='vertical', ) addFolderBoxLine1 = gui.widgetBox( widget=addFolderBox, orientation='horizontal', ) # Folder path input gui.lineEdit( widget=addFolderBoxLine1, master=self, value='rootFolderPath', orientation='horizontal', label=u'Folder path:', labelWidth=101, callback=self.updateGUI, tooltip=(u"The paths of the folders that will be added to the\n" u"list when button 'Add' is clicked.\n\n" u"Successive paths must be separated with ' / ' \n" u"(whitespace + slash + whitespace). Their order in\n" u"the list will be the same as in this field."), ) gui.separator(widget=addFolderBoxLine1, width=5) # Button Browse gui.button( widget=addFolderBoxLine1, master=self, label=u'Browse', callback=self.browse, tooltip=(u"Open a dialog for selecting a top folder.\n\n" u"Selected folder paths will appear in the field to\n" u"the left of this button afterwards, ready to be\n" u"added to the list when button 'Add' is clicked."), ) gui.separator(widget=addFolderBox, width=10) # Filter box to input include gui.separator(widget=addFolderBox, width=3) includeBoxLine1 = gui.widgetBox( widget=addFolderBox, box=False, orientation='horizontal', ) # Include box gui.checkBox( widget=includeBoxLine1, master=self, value='applyInclusion', label=u'Include', labelWidth=100, callback=lambda: includeLineEdit.setDisabled(not self. 
applyInclusion), tooltip=(u"Choose the inclusion(s)"), ) includeLineEdit = gui.lineEdit( widget=includeBoxLine1, master=self, value='inclusionsUser', orientation='horizontal', label=u'', disabled=True, labelWidth=101, tooltip=(u"This field lets you specify a custom filter\n" u"to select the folders to be\n" u"added to the list."), ) # Filter box to exclude gui.separator(widget=addFolderBox, width=3) excludeBoxLine1 = gui.widgetBox( widget=addFolderBox, box=False, orientation='horizontal', ) # Exclude box gui.checkBox( widget=excludeBoxLine1, master=self, value='applyExclusion', label=u'Exclude', labelWidth=100, disabled=False, callback=lambda: includeLineEdit2.setDisabled(not self. applyExclusion), tooltip=(u"Exclude the inclusion(s)"), ) includeLineEdit2 = gui.lineEdit( widget=excludeBoxLine1, master=self, value='exclusionsUser', orientation='horizontal', label=u'', disabled=True, labelWidth=101, tooltip=(u"This field lets you specify a custom filter\n" u"to select the folders to be\n" u"added to the list."), ) # Sampling box to input the level of sampling gui.separator(widget=addFolderBox, width=3) samplingBoxLine1 = gui.widgetBox( widget=addFolderBox, box=False, orientation='horizontal', ) # Check box for sampling gui.checkBox( widget=samplingBoxLine1, master=self, value='applySampling', label=u'Sampling', labelWidth=100, disabled=False, callback=lambda: samplingSpin.setDisabled(not self.applySampling), tooltip=(u"Choose the sampling level"), ) samplingSpin = gui.spin( widget=samplingBoxLine1, master=self, value='samplingRate', minv=10, maxv=100, labelWidth=50, orientation='horizontal', tooltip=(u"sampling level"), ) gui.separator(widget=addFolderBox, width=3) self.addButton = gui.button( widget=addFolderBox, master=self, label=u'Add', callback=self.add, tooltip=(u"Add the folder(s) currently displayed in the\n" u"'folders' text field to the list.\n\n" u"Each of these folders will be associated with the\n" u"specified encoding and annotation (if any).\n\n" 
u"Other folders may be selected afterwards and\n" u"assigned a different encoding and annotation."), ) self.advancedSettings.advancedWidgets.append(folderBox) self.advancedSettings.advancedWidgetsAppendSeparator() # Options box... optionsBox = gui.widgetBox( widget=self.controlArea, box=u'Options', orientation='vertical', addSpace=False, ) optionsBoxLine1 = gui.widgetBox( widget=optionsBox, box=False, orientation='horizontal', ) gui.separator(widget=optionsBox, width=3) optionsBoxLine2 = gui.widgetBox( widget=optionsBox, box=False, orientation='horizontal', ) gui.checkBox( widget=optionsBoxLine2, master=self, value='autoNumber', label=u'Auto-number with key:', labelWidth=180, callback=self.sendButton.settingsChanged, tooltip=(u"Annotate folders with increasing numeric indices."), ) self.autoNumberKeyLineEdit = gui.lineEdit( widget=optionsBoxLine2, master=self, value='autoNumberKey', orientation='horizontal', callback=self.sendButton.settingsChanged, tooltip=(u"Annotation key for folder auto-numbering."), ) gui.separator(widget=optionsBox, width=3) self.advancedSettings.advancedWidgets.append(optionsBox) self.advancedSettings.advancedWidgetsAppendSeparator() gui.rubber(self.controlArea) # Send button... self.sendButton.draw() # Info box... 
self.infoBox.draw() self.adjustSizeWithTimer() QTimer.singleShot(0, self.sendButton.sendIf) def inputMessage(self, message): """Handle JSON message on input connection""" if not message: return self.displayAdvancedSettings = True self.advancedSettings.setVisible(True) self.clearAll() self.infoBox.inputChanged() try: jsonData = json.loads(message.content) tempFolders = list() for entry in jsonData: path = entry.get('path', '') encoding = entry.get('encoding', '') annotationKey = entry.get('annotation_key', '') annotationValue = entry.get('annotation_value', '') if path == '' or encoding == '': self.infoBox.setText( u"Please verify keys and values of incoming " u"JSON message.", 'error') self.send('Text data', None, self) return depth = "0" options = "[i]:{unicorn}" tempFolders.append(( name, path, depth, options, )) self.folders.extend(tempFolders) self.sendButton.settingsChanged() except ValueError: self.infoBox.setText( u"Please make sure that incoming message is valid JSON.", 'error') self.send('Text data', None, self) return def sendData(self): """Load folders, create and send segmentation""" # Check that there's something on input... if (self.displayAdvancedSettings and not self.folders) or not (self.rootFolderPath or self.displayAdvancedSettings): self.infoBox.setText(u'Please select input folder.', 'warning') self.send('Text data', None, self) return # Check that autoNumberKey is not empty (if necessary)... if self.displayAdvancedSettings and self.autoNumber: if self.autoNumberKey: autoNumberKey = self.autoNumberKey else: self.infoBox.setText( u'Please enter an annotation key for auto-numbering.', 'warning') self.send('Text data', None, self) return else: autoNumberKey = None # Clear created Inputs... self.clearCreatedInputs() annotations = list() counter = 1 if self.displayAdvancedSettings: myFolders = self.folders else: myFolders = [self.folder] # Annotations... 
allFileListContent = list() for myFolder in myFolders: myFiles = myFolder['fileList'] for myFile in myFiles: annotation = dict() annotation['file name'] = myFile['fileName'] annotation['file depth level'] = myFile['depthLvl'] annotation['file path'] = myFile['absoluteFilePath'] try: annotation['file encoding, confidence'] = myFile[ 'encoding'] + ", " + str(myFile['encodingConfidence']) except TypeError: annotation['file encoding, confidence'] = "unknown" depths = [k for k in myFile.keys() if k.startswith('depth_')] for depth in depths: annotation[depth] = myFile[depth] annotations.append(annotation) allFileListContent.append(myFile['fileContent']) # Create an LTTL.Input for each files... if len(allFileListContent) == 1: label = self.captionTitle else: label = None for index in range(len(allFileListContent)): myInput = Input(allFileListContent[index], label) segment = myInput[0] segment.annotations.update(annotations[index]) myInput[0] = segment self.createdInputs.append(myInput) # If there's only one file, the widget's output is the created Input. if len(allFileListContent) == 1: self.segmentation = self.createdInputs[0] # Otherwise the widget's output is a concatenation... else: self.segmentation = Segmenter.concatenate( segmentations=self.createdInputs, label=self.captionTitle, copy_annotations=True, import_labels_as=None, sort=False, auto_number_as=None, merge_duplicates=False, progress_callback=None, ) message = u'%i segment@p sent to output ' % len(self.segmentation) message = pluralize(message, len(self.segmentation)) numChars = 0 for segment in self.segmentation: segmentLength = len(Segmentation.get_data(segment.str_index)) numChars += segmentLength message += u'(%i character@p).' 
% numChars message = pluralize(message, numChars) self.infoBox.setText(message) self.send('Text data', self.segmentation, self) self.sendButton.resetSettingsChangedFlag() def clearCreatedInputs(self): for i in self.createdInputs: Segmentation.set_data(i[0].str_index, None) del self.createdInputs[:] def importList(self): """Display a folderDialog and import folder list""" folderPath = QFileDialog.getOpenFileName(self, u'Import folder List', self.lastLocation, u'Text folders (*)') if not folderPath: return self.rootFolderPath = os.path.normpath(folderPath) self.lastLocation = os.path.dirname(folderPath) self.error() try: folderHandle = codecs.open(folderPath, encoding='utf8') folderContent = folderHandle.read() folderHandle.close() except IOError: QMessageBox.warning(None, 'Textable', "Couldn't open folder.", QMessageBox.Ok) return try: jsonData = json.loads(folderContent) tempFolders = list() for entry in jsonData: path = entry.get('path', '') encoding = entry.get('encoding', '') annotationKey = entry.get('annotation_key', '') annotationValue = entry.get('annotation_value', '') if path == '' or encoding == '': QMessageBox.warning( None, 'Textable', "Selected JSON folder doesn't have the right keys " "and/or values.", QMessageBox.Ok) return tempFolders.append(( path, encoding, annotationKey, annotationValue, )) self.folders.extend(tempFolders) if tempFolders: self.sendButton.settingsChanged() except ValueError: QMessageBox.warning(None, 'Textable', "JSON parsing error.", QMessageBox.Ok) return def exportList(self): """Display a folderDialog and export folder list""" toDump = list() myFolders = self.folders for myFolder in myFolders: toDump.append({ 'path': myFolder[0], 'encoding': myFolder[1], }) if myFolder[2] and myFolder[3]: toDump[-1]['annotation_key'] = myFolder[2] toDump[-1]['annotation_value'] = myFolder[3] folderPath = QFileDialog.getSaveFileName( self, u'Export folder List', self.lastLocation, ) if folderPath: self.lastLocation = os.path.dirname(folderPath) 
outputfolder = codecs.open( folderPath, encoding='utf8', mode='w', errors='xmlcharrefreplace', ) outputfolder.write( normalizeCarriageReturns( json.dumps(toDump, sort_keys=True, indent=4))) outputfolder.close() QMessageBox.information(None, 'Textable', 'folder list correctly exported', QMessageBox.Ok) def getFileList(self): initialRootParentPath, _ = os.path.split( # self.rootFolderPath is the initially selected's folder parent self.rootFolderPath) fileList = list() # fileListExt is a list of files matching default extension fileListExt = list() depthList = list() progressBarZero = gui.ProgressBar(self, iterations=1) # Using os.walk to walk through directories : # Variables descriptions : # currPath is a STRING, the path to the directory. # dirNames is a LIST of the names of subdirectories. # fileNames is a LIST of the names of the files in currPath # symlink are not considered in this analysis for currPath, dirNames, fileNames in os.walk(self.rootFolderPath): currRelPath = currPath[ len(initialRootParentPath) + 1:] # defines current relative path by similar initial parent path part currRelPathList = os.path.normpath(currRelPath).split( os.sep) # splits current relative path by os separator for fileName in fileNames: # file dict is a dictionary of the file's informations will get following keys : # file = { # "absoluteFilePath", # "fileName", # "depth_0", # "depth_X" # depthLvl", # "fileContent" # } # 'fileContent','encoding' and 'encodingConfidence' keys are defined when function "openFileList" is called file = dict() # Initial annotations correspond different subfolders browsed by each depth level (used for depth_X annot.) 
annotations = currRelPathList[:] currDepth = len(annotations) - 1 depthList.append(currDepth) file['absoluteFilePath'] = os.path.join(currPath, fileName) file['fileName'] = fileName file['depthLvl'] = currDepth file['depth_0'] = annotations[0] # Created an annotation by depth level, corresponding to folder names for i in range(1, currDepth + 1): file['depth_' + str(i)] = annotations[i] # Apply default file extension filter for extension in self.inclusionList: if fileName.endswith(extension): # FileListExt = file list created with default inclusion criteria (text extensions from inclusionList) fileListExt.append(file) fileList.append(file) # apply inclusion filter if self.applyInclusion: fileListIncl = [ file for file in fileList # match in inclusion list if self.match(file['fileName'], self.inclusionsUserAsList) ] else: fileListIncl = fileListExt # apply exclusion filter if self.applyExclusion: fileListExcl = [ file for file in fileListIncl # no match in exclusion list if not self.match(file['fileName'], self.exclusionsUserAsList) ] else: fileListExcl = fileListIncl # output file list self.fileList = fileListExcl if self.fileList: self.maxDepth = max(depthList) self.fileList = self.sampleFileList() self.openFileList() else: self.maxDepth = 0 progressBarZero.finish() # Test if file contains one of the patterns in patternList def match(self, file, patternList): for pattern in patternList: if pattern in file: return True return False def openFileList(self): tempFileList = list() progressBarOpen = gui.ProgressBar(self, iterations=len(self.fileList)) for file in self.fileList: fileContent = "" try: filePath = file['absoluteFilePath'] except TypeError: pass encodings = getPredefinedEncodings() try: with open(filePath, 'rb') as openedFile: fileContent = openedFile.read() charsetDict = chardet.detect(fileContent) detectedEncoding = charsetDict['encoding'] detectedConfidence = charsetDict['confidence'] # Chunking functionnality should be added here try: 
encodings.remove(detectedEncoding) encodings.insert(0, detectedEncoding) except ValueError: pass for encoding in encodings: try: self.fileContent = fileContent.decode(encoding) except: pass file['encoding'] = detectedEncoding file['fileContent'] = self.fileContent file['encodingConfidence'] = detectedConfidence progressBarOpen.advance() tempFileList.append(file) except IOError: if len(myFiles) > 1: message = u"Couldn't open file '%s'." % filePath else: message = u"Couldn't open file." self.infoBox.setText(message, 'error') self.send('Text data', None, self) return self.fileList = tempFileList self.folder = { 'rootPath': self.rootFolderPath, 'maxDepth': self.maxDepth, 'inclusionsUser': self.inclusionsUser, 'exclusionsUser': self.exclusionsUser, 'samplingRate': self.samplingRate, 'fileList': self.fileList } progressBarOpen.finish() def browse(self): """Display a QFileDialog and select a folder""" rootFolderPath = QFileDialog.getExistingDirectory( self, u'Select Folder(s)', self.lastLocation, ) if not rootFolderPath: return rootFolderPath = os.path.normpath(rootFolderPath) self.rootFolderPath = rootFolderPath self.lastLocation = rootFolderPath if self.displayAdvancedSettings: pass else: self.getFileList() self.folder = { 'rootPath': self.rootFolderPath, 'maxDepth': self.maxDepth, 'fileList': self.fileList, } self.sendButton.settingsChanged() self.updateGUI() def moveUp(self): """Move folder upward in folders listbox""" if self.selectedFolderLabels: index = self.selectedFolderLabels[0] if index > 0: temp = self.folders[index - 1] self.folders[index - 1] = self.folders[index] self.folders[index] = temp self.selectedFolderLabels.listBox.item(index - 1).setSelected(1) self.sendButton.settingsChanged() def moveDown(self): """Move folder downward in folders listbox""" if self.selectedFolderLabels: index = self.selectedFolderLabels[0] if index < len(self.folders) - 1: temp = self.folders[index + 1] self.folders[index + 1] = self.folders[index] self.folders[index] = temp 
self.selectedFolderLabels.listBox.item(index + 1).setSelected(1) self.sendButton.settingsChanged() def clearAll(self): """Remove all folders from folders attr""" del self.folders[:] del self.selectedFolderLabels[:] self.sendButton.settingsChanged() def remove(self): """Remove folder from folders attr""" if self.selectedFolderLabels: index = self.selectedFolderLabels[0] self.folders.pop(index) del self.selectedFolderLabels[:] self.sendButton.settingsChanged() def add(self): """Add folders to folders attr""" # Identify sequences separated by a comma (,) and deletes existing whitespaces self.inclusionsUserAsList = [ x.strip() for x in self.inclusionsUser.split(",") if x.strip() ] self.exclusionsUserAsList = [ x.strip() for x in self.exclusionsUser.split(",") if x.strip() ] # Calling the GetFileList function returns a self.fileList list of all files corresponding to either defaults # or optional settings self.getFileList() self.folders.append(self.folder) self.sendButton.settingsChanged() def sampleFileList(self): myList = list(self.fileList) # Sampling rate from input allows calculation of the sampling percentage samplePercentage = self.samplingRate / 100.0 # The initial list is shuffled so that files from all folders can be picked randomly random.shuffle(myList) # Files are picked randomly from the previously shuffled list nOfFiles = int(math.ceil(len(myList) * samplePercentage)) return myList[:nOfFiles] def updateGUI(self): """Update GUI state""" if self.displayAdvancedSettings: if self.selectedFolderLabels: cachedLabel = self.selectedFolderLabels[0] else: cachedLabel = None del self.folderLabels[:] folderLabels = [] if self.folders: folderRootPathsList = [f['rootPath'] for f in self.folders] maxDepthList = ['%s' % f['maxDepth'] for f in self.folders] inclusionsUserList = [ f['inclusionsUser'] for f in self.folders ] exclusionsUserList = [ f['exclusionsUser'] for f in self.folders ] samplingRatesList = [ '%s' % f['samplingRate'] for f in self.folders ] 
folderNamesList = [ os.path.basename(p) for p in folderRootPathsList ] maxFolderNameLen = max([len(n) for n in folderNamesList]) for index in range(len(self.folders)): format = u'%-' + str(maxFolderNameLen + 2) + u's' folderLabel = format % folderNamesList[index] folderLabel += "[d]:{" + maxDepthList[index] + "} " folderLabel += "[i]:{" + inclusionsUserList[index] + "} " folderLabel += "[e]:{" + exclusionsUserList[index] + "} " folderLabel += "[s]:{" + samplingRatesList[index] + "%}" folderLabels.append(folderLabel) self.folderLabels = folderLabels if cachedLabel is not None: self.sendButton.sendIfPreCallback = None self.selectedFolderLabels.listBox.item( cachedLabel).setSelected(1) self.sendButton.sendIfPreCallback = self.updateGUI if self.rootFolderPath: if (self.newAnnotationKey and self.newAnnotationValue) or ( not self.newAnnotationKey and not self.newAnnotationValue): self.addButton.setDisabled(False) else: self.addButton.setDisabled(True) else: self.addButton.setDisabled(True) if self.autoNumber: self.autoNumberKeyLineEdit.setDisabled(False) else: self.autoNumberKeyLineEdit.setDisabled(True) self.updatefolderBoxButtons() self.advancedSettings.setVisible(True) else: self.advancedSettings.setVisible(False) def updatefolderBoxButtons(self): """Update state of folder box buttons""" if self.selectedFolderLabels: self.removeButton.setDisabled(False) if self.selectedFolderLabels[0] > 0: self.moveUpButton.setDisabled(False) else: self.moveUpButton.setDisabled(True) if self.selectedFolderLabels[0] < len(self.folders) - 1: self.moveDownButton.setDisabled(False) else: self.moveDownButton.setDisabled(True) else: self.moveUpButton.setDisabled(True) self.moveDownButton.setDisabled(True) self.removeButton.setDisabled(True) if len(self.folders): self.clearAllButton.setDisabled(False) self.exportButton.setDisabled(True) else: self.clearAllButton.setDisabled(True) self.exportButton.setDisabled(True) def setCaption(self, title): if 'captionTitle' in dir(self): changed = title 
!= self.captionTitle super().setCaption(title) if changed: self.sendButton.settingsChanged() else: super().setCaption(title) def onDeleteWidget(self): self.clearCreatedInputs()
class LyricsGenius(OWTextableBaseWidget):
    """Textable widget for importing JSON data from the website Genius
    (https://genius.com/)
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Lyrics Genius"
    description = "Lyrics importation"
    icon = "icons/LyricsGenius.svg"
    priority = 10

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = []
    outputs = [("Lyrics importation", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    # Saved settings
    autoSend = settings.Setting(True)
    myBasket = settings.Setting([])

    def __init__(self):
        """Widget creator."""

        super().__init__()

        # ATTRIBUTES
        # Results of the last search (dict keyed by 1-based result number)
        self.searchResults = None
        self.inputSeg = None
        # Segmentation emitted on the output channel
        self.segmentation = None
        # newQuery = content of the "Query" line edit
        self.newQuery = ''
        self.nbr_results = 10
        # Results box attributes
        self.titleLabels = list()
        self.selectedTitles = list()
        # Corpus box attributes
        self.myTitles = list()
        self.mytitleLabels = list()
        # Stores all the Input objects (songs) created by sendData
        self.createdInputs = list()

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
        )

        #----------------------------------------------------------------------
        # User interface...
        # Create the working area
        queryBox = gui.widgetBox(
            widget=self.controlArea,
            box="Search songs",
            orientation="vertical",
        )

        # Line edit for the search query (bound to "newQuery")
        gui.lineEdit(
            widget=queryBox,
            master=self,
            value='newQuery',
            orientation='horizontal',
            label=u"Query: ",
            labelWidth=120,
            tooltip=("Enter a string"),
        )

        # Combo box for the number of requested results (bound to
        # "nbr_results"; searchFunction converts the value with int())
        queryNbr = gui.comboBox(
            widget=queryBox,
            master=self,
            value="nbr_results",
            items=[
                "10",
                "20",
                "30",
                "50",
                "60",
                "70",
                "80",
                "90",
                "100",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Number of results: ",
            labelWidth=120,
            tooltip=("Please select the desired search.\n"),
        )

        # Search button (triggers "searchFunction")
        self.searchButton = gui.button(
            widget=queryBox,
            master=self,
            label="Search",
            callback=self.searchFunction,
            tooltip="Connect Genius and make a research",
        )
        self.titleListbox = gui.listBox(
            widget=queryBox,
            master=self,
            value="selectedTitles",    # setting (list)
            labels="titleLabels",      # setting (list)
            callback=lambda: self.addButton.setDisabled(
                self.selectedTitles == list()),
            tooltip="The list of titles whose content will be imported",
        )
        self.titleListbox.setMinimumHeight(150)
        self.titleListbox.setSelectionMode(3)

        boxbutton = gui.widgetBox(
            widget=queryBox,
            box=False,
            orientation='horizontal',
        )
        # Add songs button
        self.addButton = gui.button(
            widget=boxbutton,
            master=self,
            label=u'Add to corpus',
            callback=self.add,
            tooltip=(u"Move the selected song downward in your corpus."),
        )
        self.addButton.setDisabled(True)

        # Clear button (triggers "clearResults")
        self.clearButton = gui.button(
            widget=boxbutton,
            master=self,
            label="Clear results",
            callback=self.clearResults,
            tooltip="Clear results",
        )
        self.clearButton.setDisabled(True)
        gui.separator(widget=queryBox, height=3)

        # Area where the songs added to the corpus are listed
        mytitleBox = gui.widgetBox(
            widget=self.controlArea,
            box="Corpus",
            orientation="vertical",
        )

        self.mytitleListbox = gui.listBox(
            widget=mytitleBox,
            master=self,
            value="myTitles",
            labels="mytitleLabels",
            callback=lambda: self.removeButton.setDisabled(
                self.myTitles == list()),
            tooltip="The list of titles whose content will be imported",
        )
        self.mytitleListbox.setMinimumHeight(150)
        self.mytitleListbox.setSelectionMode(3)

        boxbutton2 = gui.widgetBox(
            widget=mytitleBox,
            box=False,
            orientation='horizontal',
        )
        # Remove songs button
        self.removeButton = gui.button(
            widget=boxbutton2,
            master=self,
            label=u'Remove from corpus',
            callback=self.remove,
            tooltip=(u"Remove the selected song from your corpus."),
        )
        self.removeButton.setDisabled(True)

        # Clear corpus button.
        # NOTE: the right-hand side is evaluated first, so "callback" receives
        # the bound clearmyBasket *method*; from this assignment on, the
        # instance attribute "clearmyBasket" is the *button* and shadows the
        # method. Other methods of this class rely on that.
        self.clearmyBasket = gui.button(
            widget=boxbutton2,
            master=self,
            label=u'Clear corpus',
            callback=self.clearmyBasket,
            tooltip=(u"Remove all songs from your corpus."),
        )
        self.clearmyBasket.setDisabled(True)

        gui.separator(widget=mytitleBox, height=3)
        gui.rubber(self.controlArea)

        #----------------------------------------------------------------------
        # Draw Info box and Send button
        self.sendButton.draw()
        self.searchButton.setDefault(True)
        self.infoBox.draw()

        # Update the corpus list
        self.updateMytitleLabels()

        # Send data if autoSend.
        self.sendButton.sendIf()

    # Search function which contacts the Genius API
    def searchFunction(self):
        """Search from website Genius"""
        query_string = self.newQuery
        if query_string == "":
            self.infoBox.setText("You didn't search anything", "warning")
            return

        # One request is made per group of 10 requested results. Integer
        # division keeps the page count (and the progress bar's iteration
        # count) an int.
        page_max = int(self.nbr_results) // 10
        result_list = {}
        result_id = 0

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=page_max)

        try:
            for page in range(1, page_max + 1):
                values = {'q': query_string, 'page': page}
                data = urllib.parse.urlencode(values)
                query_url = 'http://api.genius.com/search?' + data
                json_obj = self.url_request(query_url)
                body = json_obj["response"]["hits"]

                # Each result is stored in a dictionary with its title,
                # artist's name, artist's ID and URL path
                for result in body:
                    result_id += 1
                    song = result["result"]
                    result_list[result_id] = {
                        'artist': song["primary_artist"]["name"],
                        'artist_id': song["primary_artist"]["id"],
                        'path': song["path"],
                        'title': song["title"],
                    }

                # 1 tick on the progress bar of the widget
                progressBar.advance()
        # Network failure (OSError covers urllib's URLError/HTTPError),
        # undecodable/invalid JSON (ValueError) or unexpected JSON layout.
        except (OSError, ValueError, KeyError):
            self.infoBox.setText(
                "Couldn't download data from Genius website.",
                "error",
            )
            return
        finally:
            # Always give the interface back to the user, even on failure.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        # Store the results
        self.searchResults = result_list

        # Reset and clear the visible widget list
        del self.titleLabels[:]

        # Update the results list with the search results
        # in order to display them
        for idx in self.searchResults:
            result_string = self.searchResults[idx]["title"] + " - " + \
                            self.searchResults[idx]["artist"]
            self.titleLabels.append(result_string)

        # Re-assignment kept from the original code (presumably what makes
        # the list box refresh after an in-place change).
        self.titleLabels = self.titleLabels
        self.clearButton.setDisabled(False)
        self.addButton.setDisabled(self.selectedTitles == list())

    # Function contacting the Genius API and returning JSON objects
    def url_request(self, url):
        """Opens a URL and returns it as a JSON object"""

        # Token to use the Genius API. DO NOT CHANGE.
        # NOTE(review): this credential is hard-coded in the source.
        ACCESS_TOKEN = "PNlSRMxGK1NqOUBelK32gLirqAtWxPzTey" \
                       "9pReIjzNiVKbHBrn3o59d5Zx7Yej8g"
        USER_AGENT = "CompuServe Classic/1.22"

        request = urllib.request.Request(url, headers={
            "Authorization": "Bearer " + ACCESS_TOKEN,
            "User-Agent": USER_AGENT
        })
        response = urllib.request.urlopen(request)
        raw = response.read().decode('utf-8')
        # Return the decoded JSON object
        return json.loads(raw)

    # Function converting HTML to string
    def html_to_text(self, page_url):
        """Extracts the lyrics (as a string) of the html page"""
        page = requests.get(page_url)
        html = BeautifulSoup(page.text, "html.parser")
        # Drop <script> elements before extracting the text.
        for script in html('script'):
            script.extract()

        lyrics = html.find("div", class_="lyrics").get_text()
        # str.replace returns a new string: the result must be used,
        # otherwise the literal "\n" sequences are left untouched.
        return lyrics.replace('\\n', '\n')

    # Function clearing the results list
    def clearResults(self):
        """Clear the results list"""
        del self.titleLabels[:]
        self.titleLabels = self.titleLabels
        self.clearButton.setDisabled(True)
        self.addButton.setDisabled(self.titleLabels == list())

    # Add songs function
    def add(self):
        """Add songs in your selection """
        for selectedTitle in self.selectedTitles:
            # searchResults is keyed from 1, list box indices start at 0.
            songData = self.searchResults[selectedTitle + 1]
            if songData not in self.myBasket:
                self.myBasket.append(songData)
        self.updateMytitleLabels()
        self.sendButton.settingsChanged()

    # Update selections function
    def updateMytitleLabels(self):
        """Rebuild the corpus list box labels from myBasket"""
        self.mytitleLabels = list()
        for songData in self.myBasket:
            result_string = songData["title"] + " - " + songData["artist"]
            self.mytitleLabels.append(result_string)
        self.mytitleLabels = self.mytitleLabels

        # Here "clearmyBasket" is the button (see __init__).
        self.clearmyBasket.setDisabled(self.myBasket == list())
        self.removeButton.setDisabled(self.myTitles == list())

    # Function removing the selected songs from the corpus
    def remove(self):
        """Remove the selected songs in your selection """
        self.myBasket = [
            song for idx, song in enumerate(self.myBasket)
            if idx not in self.myTitles
        ]
        self.updateMytitleLabels()
        self.sendButton.settingsChanged()

    # Clear selections function
    def clearmyBasket(self):
        """Remove all songs in your selection """
        self.mytitleLabels = list()
        self.myBasket = list()
        self.sendButton.settingsChanged()
        # Here "clearmyBasket" is the button (see __init__).
        self.clearmyBasket.setDisabled(True)

    # Function computing results then sending them to the widget output
    def sendData(self):
        """Compute result of widget processing and send to output"""
        # Skip if title list is empty:
        if not self.myBasket:
            self.infoBox.setText(
                "Your corpus is empty, please add some songs first",
                "warning")
            # Reset output channel.
            self.send("Lyrics importation", None, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # Attempt to connect to Genius and retrieve lyrics...
        song_content = list()
        annotations = list()
        try:
            for song in self.myBasket:
                # song is a dict with keys 'artist', 'artist_id', 'path'
                # and 'title' (as stored by searchFunction)
                page_url = "http://genius.com" + song['path']
                lyrics = self.html_to_text(page_url)
                song_content.append(lyrics)
                annotations.append(song.copy())
                # 1 tick on the progress bar of the widget
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText(
                "Couldn't download data from Genius website.",
                "error")
            # Reset output channel.
            self.send("Lyrics importation", None, self)
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store downloaded lyrics strings in input objects...
        for song in song_content:
            newInput = Input(song, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one song, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Lyrics importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def clearCreatedInputs(self):
        """Delete all Input objects that have been created."""
        for i in self.createdInputs:
            Segmentation.set_data(i[0].str_index, None)
        del self.createdInputs[:]

    def onDeleteWidget(self):
        """Free memory when widget is deleted (overriden method)"""
        self.clearCreatedInputs()

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...
    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class Childes(OWTextableBaseWidget):
    """Textable widget for importing data in XML format from the CHILDES
    database (https://childes.talkbank.org/data-xml/).
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "CHILDES"
    description = "Import XML data from the CHILDES database"
    icon = "icons/CHILDES.svg"
    priority = 15

    #----------------------------------------------------------------------
    # Channel definitions (NB: no input in this case)...

    inputs = []
    outputs = [
        ("Files", Segmentation, widget.Default),
        ("Utterances", Segmentation),
        ("Words", Segmentation),
    ]

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    importedCorpora = settings.Setting(list())
    outputUtterances = settings.Setting(False)
    outputWords = settings.Setting(False)
    includePrefixes = settings.Setting(False)
    includePOSTag = settings.Setting(False)
    autoSend = settings.Setting(False)

    #----------------------------------------------------------------------
    # Other class variables...

    baseUrl = "https://childes.talkbank.org/data-xml/"
    cacheFilename = "cache_childes"
    cachedFoldername = "cached_childes_corpora"

    want_main_area = False

    def __init__(self):
        """Widget creator."""

        super().__init__()

        # Other (non-setting) attributes...
        self.fileSegmentation = None
        # Initialized here so that sendData never reads segmentations left
        # over from (or missing since) a previous run.
        self.utteranceSegmentation = None
        self.wordSegmentation = None
        self.createdInputs = list()
        self.displayedFolderLabels = list()
        self.currentFolder = self.__class__.baseUrl
        self.database = None
        self.selectedInDisplayedFolder = list()
        self.selectedInSelection = list()
        self.selectionLabels = list()

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
        )

        # User interface...

        # Browse database box
        browseBox = gui.widgetBox(
            widget=self.controlArea,
            box="Browse database",
            orientation="vertical",
            addSpace=False,
        )

        self.currentFolderLabel = gui.label(
            widget=browseBox,
            master=self,
            label="Current folder: /",
            tooltip="This is the currently displayed folder.",
        )

        gui.separator(widget=browseBox, height=3)

        upwardNavBox = gui.widgetBox(
            widget=browseBox,
            box=False,
            orientation="horizontal",
        )
        # Label and tooltip of this button are set by updateBrowseBoxButtons.
        self.homeRefreshButton = gui.button(
            widget=upwardNavBox,
            master=self,
            label="Home",
            callback=self.homeRefreshPressed,
        )
        self.backButton = gui.button(
            widget=upwardNavBox,
            master=self,
            label="Back",
            callback=self.backPressed,
            tooltip="View parent folder.",
        )

        gui.separator(widget=browseBox, height=3)

        displayedFolderListbox = gui.listBox(
            widget=browseBox,
            master=self,
            value="selectedInDisplayedFolder",
            labels="displayedFolderLabels",
            callback=self.corpusSelected,
            tooltip="Select an item to open or import.",
        )
        displayedFolderListbox.setMinimumHeight(120)
        displayedFolderListbox.setSelectionMode(3)
        displayedFolderListbox.doubleClicked.connect(
            self.displayedFoldersDoubleClicked)

        downwardNavBox = gui.widgetBox(
            widget=browseBox,
            box=False,
            orientation="horizontal",
        )
        self.openButton = gui.button(
            widget=downwardNavBox,
            master=self,
            label="Open",
            callback=self.openPressed,
            tooltip="View selected folder's contents.",
        )
        self.addButton = gui.button(
            widget=downwardNavBox,
            master=self,
            label="Add to selection",
            callback=self.addPressed,
            tooltip="Add selected items contents to selection.",
        )

        gui.separator(widget=browseBox, height=3)

        # Selection box...
        selectionBox = gui.widgetBox(
            widget=self.controlArea,
            box="Selection",
            orientation="vertical",
            addSpace=False,
        )

        selectionListbox = gui.listBox(
            widget=selectionBox,
            master=self,
            value="selectedInSelection",
            labels="selectionLabels",
            callback=self.updateRemovalButtons,
            tooltip="The list of corpora whose content will be imported",
        )
        selectionListbox.setMinimumHeight(120)
        selectionListbox.setSelectionMode(3)
        selectionListbox.doubleClicked.connect(self.selectionDoubleClicked)

        removalBox = gui.widgetBox(
            widget=selectionBox,
            box=False,
            orientation="horizontal",
        )
        self.removeButton = gui.button(
            widget=removalBox,
            master=self,
            label="Remove from selection",
            callback=self.removePressed,
            tooltip="Remove the selected corpus.",
        )
        self.clearButton = gui.button(
            widget=removalBox,
            master=self,
            label="Clear selection",
            callback=self.clearPressed,
            tooltip="Remove all corpora from selection.",
        )

        gui.separator(widget=selectionBox, height=3)

        # Options box...
        optionsBox = gui.widgetBox(
            widget=self.controlArea,
            box="Options",
            orientation="vertical",
            addSpace=False,
        )

        gui.checkBox(
            widget=optionsBox,
            master=self,
            value='outputUtterances',
            label=u'Output utterance segmentation',
            callback=self.sendButton.settingsChanged,
            tooltip=u"Toggle emission of utterance segmentation on or off.",
        )

        gui.separator(widget=optionsBox, height=1)

        gui.checkBox(
            widget=optionsBox,
            master=self,
            value='outputWords',
            label=u'Output word segmentation',
            callback=self.toggleWordOptions,
            tooltip=u"Toggle emission of word segmentation on or off.",
        )

        gui.separator(widget=optionsBox, height=1)

        self.wordOptionsBox = gui.indentedBox(
            widget=optionsBox,
            orientation="horizontal",
            addSpace=False,
        )

        gui.label(
            widget=self.wordOptionsBox,
            master=self,
            labelWidth=135,
            label="Word stem includes: ",
            tooltip="Select the elements that will be added to the stem.",
        )

        gui.checkBox(
            widget=self.wordOptionsBox,
            master=self,
            labelWidth=85,
            value='includePOSTag',
            label=u'POS-tag',
            callback=self.sendButton.settingsChanged,
            tooltip=u"Add the part-of-speech tag (e.g. door => N|door).",
        )

        gui.checkBox(
            widget=self.wordOptionsBox,
            master=self,
            labelWidth=85,
            value='includePrefixes',
            label=u'prefixes',
            callback=self.sendButton.settingsChanged,
            tooltip=u"Include the prefix (e.g. write => re#write).",
        )

        # Make the word options match the initial state of 'outputWords'
        # (same rule as toggleWordOptions, without flagging a change).
        self.wordOptionsBox.setDisabled(not self.outputWords)

        gui.separator(widget=self.controlArea, height=3)

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()

        # This initialization step needs to be done after infoBox has been
        # drawn (because we may need to display an error message).
        self.loadDatabaseCache()

        self.updateSelection()

        # Send data if autoSend.
        self.sendButton.sendIf()

        self.setMinimumWidth(350)
        self.adjustSizeWithTimer()

    def _getModuleFolder(self):
        """Return the absolute path of the folder containing this module."""
        return os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))

    def _resetOutputs(self):
        """Send None on each of the three output channels."""
        self.send("Files", None, self)
        self.send("Utterances", None, self)
        self.send("Words", None, self)

    def _downloadCorpus(self, url, cacheFilepath):
        """Download the zip file at url and return it as a ZipFile.

        The downloaded bytes are also written to cacheFilepath; failures to
        create the folder or write the file are deliberately ignored (the
        cache is best-effort). Download and zip errors are propagated.
        """
        response = requests.get(url)
        myZip = zipfile.ZipFile(io.BytesIO(response.content))
        try:
            os.makedirs(os.path.dirname(cacheFilepath))
        except OSError:
            # Typically: the folder already exists.
            pass
        try:
            with open(cacheFilepath, "wb") as outputFile:
                outputFile.write(response.content)
        except IOError:
            pass
        return myZip

    def _prepareContentForWords(self, file_content):
        """Return file_content rewritten for word segmentation."""
        # Implement replacements.
        file_content = re.sub(
            r"<w.+?(<replacement.+</replacement>).*?</w>",
            r"\1",
            file_content,
        )
        # Prepend pre-clitics.
        file_content = re.sub(
            r"(<mor .+?)(<mor-pre>.+</mor-pre>)",
            r"\2\1",
            file_content,
        )
        # Move <gra> into <mw>.
        file_content = re.sub(
            r"(</mw>)(<gra.+?/>)",
            r"\2\1",
            file_content,
        )
        return file_content

    def _getTargetChildData(self, participantSeg):
        """Return a list with one annotation dict per Target_Child segment."""
        targetChildData = list()
        for participant in participantSeg:
            participantAnnotations = participant.annotations
            # .get(): a participant without a role is simply not a target.
            if participantAnnotations.get("role") != "Target_Child":
                continue
            childData = dict()
            if "age" in participantAnnotations:
                age = participantAnnotations["age"]
                childData["target_child_age"] = age
                age_parse = re.search(r"(\d+)Y(\d+)M(\d+)D", age)
                if age_parse:
                    childData["target_child_years"] = age_parse.group(1)
                    # Cumulative age in months, then in days (30-day months).
                    months = int(age_parse.group(2)) \
                        + 12 * int(age_parse.group(1))
                    childData["target_child_months"] = '%02d' % months
                    days = int(age_parse.group(3)) + 30 * months
                    childData["target_child_days"] = '%02d' % days
            if "id" in participantAnnotations:
                childData["target_child_id"] = participantAnnotations["id"]
            if "sex" in participantAnnotations:
                childData["target_child_sex"] = participantAnnotations["sex"]
            targetChildData.append(childData)
        return targetChildData

    def sendData(self):
        """Compute result of widget processing and send to output"""
        if not self.importedCorpora:
            self.infoBox.setText("Please add a corpus to the selection.",
                                 "warning")
            self._resetOutputs()
            return

        # Clear created Inputs and initialize progress bar...
        self.clearCreatedInputs()
        numberOfSteps = 2 if self.outputUtterances else 1
        numberOfSteps += 2 if self.outputWords else 0
        self.infoBox.setText(
            "(1/%i) Retrieving data, please wait..." % numberOfSteps,
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.importedCorpora))

        annotations = list()
        basepath = self._getModuleFolder()

        # Iterate over corpora...
        for importedCorpus in self.importedCorpora:

            corpus = importedCorpus.split("/")[-1]
            corpusFilepath = os.path.normpath(
                os.path.join(
                    basepath,
                    self.__class__.cachedFoldername,
                    importedCorpus[len(self.__class__.baseUrl):],
                ))

            # Try to retrieve corpus from cache...
            try:
                myZip = zipfile.ZipFile(corpusFilepath)

            # Else (missing or corrupt cached file) try to download (and
            # cache) requested zip file...
            except (IOError, zipfile.BadZipFile):
                try:
                    myZip = self._downloadCorpus(
                        importedCorpus, corpusFilepath)

                # If an error occurs (e.g. connection error)...
                except Exception:

                    # Set Info box and widget to "error" state.
                    self.infoBox.setText(
                        "Couldn't download corpus %s from CHILDES website." %
                        corpus, "error")

                    # Reset output channels.
                    self._resetOutputs()
                    progressBar.finish()
                    self.controlArea.setDisabled(False)
                    return

            # Create Input for each zipped file and store annotations...
            with myZip:
                for zippedFile in myZip.infolist():
                    file_content = myZip.read(zippedFile).decode('utf-8')

                    # If word segmentation is requested...
                    if self.outputWords:
                        file_content = self._prepareContentForWords(
                            file_content)

                    newInput = Input(
                        file_content, self.captionTitle + "_files")
                    self.createdInputs.append(newInput)
                    chatSeg = Segmenter.import_xml(newInput, "CHAT")
                    fileAnnotations = {"file_path": zippedFile.filename}
                    for key in ["Corpus", "Lang", "PID"]:
                        try:
                            fileAnnotations[key.lower()] = \
                                chatSeg[0].annotations[key]
                        except KeyError:
                            pass
                    participantListSeg = Segmenter.import_xml(
                        newInput, "Participants")
                    recodedInput, _ = Segmenter.recode(
                        participantListSeg,
                        [(re.compile("/>"), "> </participant>")])
                    participantSeg = Segmenter.import_xml(
                        recodedInput, "participant")
                    targetChildData = self._getTargetChildData(
                        participantSeg)
                    # Target child data is only stored when unambiguous.
                    if len(targetChildData) == 1:
                        fileAnnotations.update(targetChildData[0])
                    annotations.append(fileAnnotations)

            progressBar.advance()

        # If there's only one file, the widget's output is the created Input...
        if len(self.createdInputs) == 1:
            self.fileSegmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.fileSegmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle + "_files",
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.fileSegmentation):
            segment.annotations.update(annotations[idx])
            self.fileSegmentation[idx] = segment

        # Terminate progress bar...
        progressBar.finish()

        message = "%i file@p" % len(self.fileSegmentation)
        message = pluralize(message, len(self.fileSegmentation))
        self.send("Files", self.fileSegmentation, self)

        # Build utterance segmentation if needed...
        if self.outputUtterances:
            self.infoBox.setText(
                "(2/%i) Building utterance segmentation, please wait..." \
                    % numberOfSteps,
                "warning",
            )
            progressBar = ProgressBar(
                self, iterations=len(self.fileSegmentation))
            self.utteranceSegmentation = Segmenter.import_xml(
                self.fileSegmentation,
                "u",
                progress_callback=progressBar.advance,
                label=self.captionTitle + "_utterances",
            )
            progressBar.finish()
            message += " and " if not self.outputWords else ", "
            message += "%i utterance@p" % len(self.utteranceSegmentation)
            message = pluralize(message, len(self.utteranceSegmentation))
            self.send("Utterances", self.utteranceSegmentation, self)
        else:
            # Forget any utterance segmentation built by a previous run.
            self.utteranceSegmentation = None
            self.send("Utterances", None, self)

        # Build word segmentation if needed...
        if self.outputWords:
            self.infoBox.setText(
                "(%i/%i) Building word segmentation, please wait..." \
                    % (2 + (1 if self.outputUtterances else 0), numberOfSteps),
                "warning",
            )
            # Words are extracted from the utterances built during THIS run
            # when available, else directly from the files.
            if self.outputUtterances:
                baseSegmentation = self.utteranceSegmentation
            else:
                baseSegmentation = self.fileSegmentation
            progressBar = ProgressBar(
                self, iterations=2 * len(baseSegmentation))
            wordSegmentation = Segmenter.import_xml(
                baseSegmentation,
                "w",
                progress_callback=progressBar.advance,
            )
            mwSegmentation = Segmenter.import_xml(
                baseSegmentation,
                "mw",
                progress_callback=progressBar.advance,
            )

            # Analyze words to extract annotations...
            self.infoBox.setText(
                "(%i/%i) Extracting word annotations, please wait..." \
                    % (3 + (1 if self.outputUtterances else 0), numberOfSteps),
                "warning",
            )
            progressBar.finish()
            progressBar = ProgressBar(self, iterations=len(wordSegmentation))
            wordSegments = list()
            for word in wordSegmentation:
                mws = word.get_contained_segments(mwSegmentation)
                if mws:
                    # One output segment per <mw> contained in the word.
                    for mw in mws:
                        wordSegment = word.deepcopy()
                        wordSegment.annotations.update(
                            self.extractWordAnnotations(mw))
                        wordSegments.append(wordSegment)
                else:
                    wordSegments.append(word)
                progressBar.advance()

            self.wordSegmentation = Segmentation(
                wordSegments,
                label=self.captionTitle + "_words",
            )

            message += " and %i word@p" % len(self.wordSegmentation)
            message = pluralize(message, len(self.wordSegmentation))
            self.send("Words", self.wordSegmentation, self)
        else:
            self.wordSegmentation = None
            self.send("Words", None, self)

        # Set status to OK and report data size...
        message += " sent to output."
        message = pluralize(message, len(self.fileSegmentation))
        self.infoBox.setText(message)
        progressBar.finish()

        self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()

    def extractWordAnnotations(self, mw):
        """Extract annotations from a word's mor tag in CHILDES XML format and
        return a dict of annotations.

        The returned dict always has the keys "pos" and "stem"; "prefixes",
        "suffixes", "index", "head" and "relation" are added when the
        corresponding elements are present. Missing stem or empty element
        text is treated as an empty string.
        """
        root = ET.fromstring("<mw>" + mw.get_content() + "</mw>")
        annotations = dict()
        pos_items = list()
        prefixes = list()
        suffixes = list()
        stem = ""
        # Marker type => separator used in the "suffixes" annotation.
        suffixSeparators = {"sfxf": "&", "sfx": "-", "mc": ":"}
        for child in root:
            text = child.text or ""
            if child.tag == "pos":
                for grandchild in child:
                    # The <c> element goes first, other items follow.
                    if grandchild.tag == "c":
                        pos_items.insert(0, grandchild.text or "")
                    else:
                        pos_items.append(grandchild.text or "")
            elif child.tag == "stem":
                stem = text
            elif child.tag == "mpfx":
                prefixes.append(text)
            elif child.tag == "mk":
                separator = suffixSeparators.get(child.attrib.get("type"))
                if separator is not None:
                    suffixes.append(separator + text)
            elif child.tag == "gra":
                for key in ["index", "head", "relation"]:
                    annotations[key] = child.attrib[key]
        annotations["pos"] = ":".join(pos_items)
        if prefixes:
            annotations["prefixes"] = "#".join(prefixes)
            if self.includePrefixes:
                stem = annotations["prefixes"] + "#" + stem
        if suffixes:
            annotations["suffixes"] = "".join(suffixes)
        if self.includePOSTag:
            stem = annotations["pos"] + "|" + stem
        annotations["stem"] = stem
        return annotations

    def homeRefreshPressed(self):
        """Return to the database root, or refresh the database if already
        there"""
        if self.currentFolder != self.__class__.baseUrl:
            self.currentFolder = self.__class__.baseUrl
            self.updateDisplayedFolders()
        else:
            self.refreshDatabaseCache()

    def backPressed(self):
        """Display parent folder's contents"""
        # Strip the trailing slash, drop the last path component, and put
        # the trailing slash back.
        self.currentFolder = "/".join(self.currentFolder[:-1].split("/")[:-1])
        self.currentFolder += "/"
        self.updateDisplayedFolders()

    def corpusSelected(self):
        """Update the browse box buttons after a selection change"""
        self.updateBrowseBoxButtons()

    def openPressed(self):
        """Display selected folder's contents"""
        if len(self.selectedInDisplayedFolder) == 1:
            self.currentFolder += self.displayedFolderLabels[
                self.selectedInDisplayedFolder[0]]
            self.updateDisplayedFolders()

    def addPressed(self):
        """Import selected corpora"""
        corpora = list()
        for item in self.selectedInDisplayedFolder:
            label = self.displayedFolderLabels[item]
            self.getZipsFromItem(label, self.currentFolder, corpora)
        # Merge with the existing selection, without duplicates, sorted.
        self.importedCorpora = sorted(set(self.importedCorpora) | set(corpora))
        self.addButton.setDisabled(True)
        self.updateSelection()
        self.sendButton.settingsChanged()

    def getZipsFromItem(self, label, folder, zipList):
        """Get selected zip files or those contained in selected folders

        label is the name of a zip file or subfolder relative to folder;
        the URLs of all zip files found are appended to zipList.
        """
        if label.endswith(".zip"):
            zipList.append(folder + label)
            return
        baseUrl = self.__class__.baseUrl
        newFolder = self.getFolderContent(folder + label)
        for key in newFolder.keys():
            # Zip entries map a file name to its full URL; folder entries
            # are keyed by their full URL.
            itemUrl = newFolder[key] if key.endswith(".zip") else key
            # The item is now expressed relative to the database root, so
            # the recursion must start from the root (not from 'folder').
            self.getZipsFromItem(itemUrl[len(baseUrl):], baseUrl, zipList)

    def displayedFoldersDoubleClicked(self):
        """Reroute to 'openPressed' or 'addPressed' as needed"""
        if not self.selectedInDisplayedFolder:
            return
        if self.displayedFolderLabels[
                self.selectedInDisplayedFolder[0]].endswith(".zip"):
            self.addPressed()
        else:
            self.openPressed()

    def removePressed(self):
        """Remove selected items"""
        # Delete from the end so that remaining indices stay valid.
        for idx in sorted(self.selectedInSelection, reverse=True):
            del self.importedCorpora[idx]
        self.updateSelection()
        self.sendButton.settingsChanged()

    def clearPressed(self):
        """Empty the selection"""
        self.importedCorpora = list()
        self.updateSelection()
        self.sendButton.settingsChanged()

    def selectionDoubleClicked(self):
        """Reroute to removePressed"""
        self.removePressed()

    def toggleWordOptions(self):
        """Toggle display of word options on or off"""
        self.wordOptionsBox.setDisabled(not self.outputWords)
        self.sendButton.settingsChanged()

    def refreshDatabaseCache(self):
        """Refresh the database cache"""
        # The corpus cache lives next to this module (see sendData).
        cacheFolderpath = os.path.join(
            self._getModuleFolder(), self.__class__.cachedFoldername)

        # Only ask about cached corpora when there are some.
        removeCachedFiles = False
        if os.path.isdir(cacheFolderpath) and os.listdir(cacheFolderpath):
            # NOTE(review): QMessageBox is taken from AnyQt.QtGui as in the
            # original code -- confirm this resolves with the Qt binding in
            # use.
            dialog = AnyQt.QtGui.QMessageBox()
            response = dialog.question(self, "CHILDES",
                                       "Keep previously saved files?",
                                       dialog.Yes | dialog.No)
            removeCachedFiles = response == dialog.No

        self.infoBox.setText(
            "Connecting to CHILDES website, please wait...",
            "warning",
        )
        progressBar = ProgressBar(self, iterations=1)
        self.controlArea.setDisabled(True)

        # Scrape website...
        self.database = dict()
        self.importedCorpora = list()
        try:
            self.recursivelyScrapeUrl(
                self.__class__.baseUrl,
                self.database,
            )
            # Dump cache to file...
            cacheFilepath = os.path.join(
                self._getModuleFolder(), self.__class__.cacheFilename)
            try:
                with open(cacheFilepath, "wb") as cacheFile:
                    pickle.dump(self.database, cacheFile)
            except IOError:
                self.infoBox.setText(
                    "Couldn't save database to disk.",
                    "warning",
                )
            self.sendButton.settingsChanged()
        except requests.exceptions.ConnectionError:
            self.infoBox.setText(
                "Error while attempting to scrape the CHILDES website.",
                "error",
            )
            self._resetOutputs()

        # Remove saved files if required...
        if removeCachedFiles:
            try:
                shutil.rmtree(cacheFolderpath)
            except OSError:
                self.infoBox.setText(
                    "Couldn't remove previously saved files.",
                    "warning",
                )

        progressBar.advance()
        progressBar.finish()
        self.currentFolder = self.__class__.baseUrl
        self.updateDisplayedFolders()
        self.updateSelection()
        self.controlArea.setDisabled(False)

    def recursivelyScrapeUrl(self, url, urls):
        """Scrape the CHILDES website recursively

        Fills urls[url] with a dict mapping zip file names to their URL and
        subfolder URLs to nested dicts of the same kind; folders without
        any content are not stored.
        """
        page = requests.get(url)
        soup = BeautifulSoup(page.text, "html.parser")
        links = soup.find_all('a')
        if not links:
            return
        urls[url] = dict()
        for link in links:
            href = link.get("href")
            if href is None:
                # Anchor without target: nothing to follow.
                continue
            new_url = url + href
            if (href.endswith("/") and len(href) > 1
                    and not href.startswith("/data-xml/")):
                self.recursivelyScrapeUrl(new_url, urls[url])
            elif href.endswith(".zip"):
                urls[url][href] = new_url
        if len(urls[url]) == 0:
            del urls[url]

    def loadDatabaseCache(self):
        """Load the cached database"""
        # Try to open saved file in this module's directory...
        cacheFilepath = os.path.join(
            self._getModuleFolder(), self.__class__.cacheFilename)
        try:
            # NOTE: pickle must only be used on trusted data; this file is
            # the one written by refreshDatabaseCache.
            with open(cacheFilepath, "rb") as cacheFile:
                self.database = pickle.load(cacheFile)

        # Else (missing or unreadable cache) try to rebuild cache from
        # CHILDES website...
        except (IOError, EOFError, pickle.UnpicklingError):
            self.refreshDatabaseCache()
        else:
            self.currentFolder = self.__class__.baseUrl
            self.updateDisplayedFolders()

    def updateDisplayedFolders(self):
        """Refresh state of displayed folder listbox"""
        # If database couldn't be loaded...
        if not self.database:
            self.currentFolderLabel.setText(
                "No database loaded, please click 'Refresh'.")
            self.homeRefreshButton.setDisabled(False)
            self.homeRefreshButton.setText("Refresh")
            self.backButton.setDisabled(True)
            self.openButton.setDisabled(True)
            self.addButton.setDisabled(True)
            return

        # Current folder label (path relative to the root, leading slash
        # included)...
        currentFolder = self.currentFolder[len(self.__class__.baseUrl) - 1:]
        self.currentFolderLabel.setText("Current folder: " + currentFolder)

        # Populate listbox...
        folderContent = self.getFolderContent(self.currentFolder)
        displayedFolderLabels = list()
        for item in folderContent.keys():
            if item.endswith(".zip"):
                displayedFolderLabels.append(item)
            else:
                # Folders are keyed by URL: display their last component.
                displayedFolderLabels.append(item.split("/")[-2] + "/")
        self.displayedFolderLabels = displayedFolderLabels

        # Buttons.
        self.updateBrowseBoxButtons()

    def getFolderContent(self, folder):
        """Return the database entry (a dict) for the folder at URL folder"""
        baseUrl = self.__class__.baseUrl
        folderContent = self.database[baseUrl]
        # Path relative to the root, e.g. "/" or "/a/b/".
        folder = folder[len(baseUrl) - 1:]
        steps = folder[:-1].split("/")[1:]
        # Walk down the nested dicts, one path component at a time.
        for idx, _ in enumerate(steps):
            path = baseUrl + "/".join(steps[:idx + 1]) + "/"
            folderContent = folderContent[path]
        return folderContent

    def updateBrowseBoxButtons(self):
        """Refresh state of Browse box buttons"""
        currentFolder = self.currentFolder[len(self.__class__.baseUrl) - 1:]
        if currentFolder == "/":
            self.homeRefreshButton.setText("Refresh")
            self.homeRefreshButton.setToolTip(
                "Connect to CHILDES website and refresh corpus list.")
        else:
            self.homeRefreshButton.setText("Home")
            self.homeRefreshButton.setToolTip("Return to database root.")
        self.backButton.setDisabled(currentFolder == "/")
        # "Open" requires exactly one selected item that is not a zip file.
        self.openButton.setDisabled(
            len(self.selectedInDisplayedFolder) != 1
            or self.displayedFolderLabels[
                self.selectedInDisplayedFolder[0]].endswith(".zip"))
        self.addButton.setDisabled(not self.selectedInDisplayedFolder)

    def updateSelection(self):
        """Refresh state of selection listbox"""
        self.selectionLabels = [
            corpus[len(self.__class__.baseUrl) - 1:]
            for corpus in self.importedCorpora
        ]
        self.updateRemovalButtons()

    def updateRemovalButtons(self):
        """Refresh state of Selection box buttons"""
        self.removeButton.setDisabled(not self.selectedInSelection)
        self.clearButton.setDisabled(not self.importedCorpora)

    def clearCreatedInputs(self):
        """Delete all Input objects that have been created."""
        for i in self.createdInputs:
            Segmentation.set_data(i[0].str_index, None)
        del self.createdInputs[:]

    def onDeleteWidget(self):
        """Free memory when widget is deleted (overriden method)"""
        self.clearCreatedInputs()

    # The following method need to be copied (without any change) in
    # every Textable widget...
    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class SpaCy(OWTextableBaseWidget):
    """Textable widget for NLP using spaCy.

    The widget tokenizes the incoming segmentation with a spaCy language
    model and sends the tokens on the default output channel. Depending on
    the options ticked by the user, tokens are processed by additional
    pipeline components ("tagger", "parser", "ner") and extra segmentations
    (named entities, noun chunks, sentences) are sent on separate channels.

    A second tab ("Model manager") lets the user download language models.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "spaCy"
    description = "Natural language processing using spaCy"
    icon = "icons/spacy.svg"
    priority = 21   # TODO

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Text data", Segmentation, "inputData")]
    outputs = [
        ("Tokenized text", Segmentation, widget.Default),
        ("Named entities", Segmentation),
        ("Noun chunks", Segmentation),
        ("Sentences", Segmentation),
    ]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0]
    )

    # NB: the value must be one of the items of the "max length" combo box
    # built in __init__, since sendData() parses its leading integer as a
    # number of millions of characters.
    maxLen = settings.Setting("1 million")
    annotatePOSTags = settings.Setting(False)
    annotateDependencies = settings.Setting(False)
    annotateEntities = settings.Setting(False)
    segmentEntities = settings.Setting(False)
    segmentChunks = settings.Setting(False)
    segmentSentences = settings.Setting(False)
    autoSend = settings.Setting(False)
    model = settings.Setting("")

    def __init__(self):
        """Widget creator."""

        super().__init__()

        # Keep the model restored from settings if it is still installed,
        # otherwise fall back to the first installed model (if any)...
        if self.model not in INSTALLED_MODELS:
            self.model = INSTALLED_MODELS[0] if INSTALLED_MODELS else ""

        # Other attributes...
        self.inputSeg = None
        self.nlp = None
        self.selectedModels = list()
        self.downloadableModelLabels = list()
        self.loadedComponents = list()
        self.mustLoad = True

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )

        # User interface...

        # Tabs...
        self.tabs = QTabWidget()
        self.optionsTab = QWidget()
        self.modelManagerTab = QWidget()
        self.tabs.addTab(self.optionsTab, "Options")
        self.tabs.addTab(self.modelManagerTab, "Model manager")

        # Options tab...
        OptionsTabBox = QHBoxLayout()

        optionsBox = gui.widgetBox(widget=self.optionsTab)

        self.modelComboBox = gui.comboBox(
            widget=optionsBox,
            master=self,
            value='model',
            label='Model: ',
            tooltip='Select the spaCy language model you want to use.',
            items=INSTALLED_MODELS[:],
            sendSelectedValue=True,
            callback=self.modelComboboxChanged,
        )

        gui.separator(widget=optionsBox, height=3)

        annotationsBox = gui.widgetBox(
            widget=optionsBox,
            box="Additional token annotations:",
        )
        self.annotatePOSTagsReloadLabel = self._addOptionWithReloadLabel(
            annotationsBox,
            value='annotatePOSTags',
            label='part-of-speech tags',
            tooltip="Annotate output tokens with part-of-speech tags.",
        )
        self.annotateDependenciesReloadLabel = self._addOptionWithReloadLabel(
            annotationsBox,
            value='annotateDependencies',
            label='syntactic dependencies',
            tooltip="Annotate output tokens with syntactic dependencies.",
        )
        self.annotateEntitiesReloadLabel = self._addOptionWithReloadLabel(
            annotationsBox,
            value='annotateEntities',
            label='named entities',
            tooltip="Annotate output tokens with named entities.",
        )

        segmentationsBox = gui.widgetBox(
            widget=optionsBox,
            box="Additional segmentations:",
        )
        self.segmentEntitiesReloadLabel = self._addOptionWithReloadLabel(
            segmentationsBox,
            value='segmentEntities',
            label='named entities',
            tooltip="Output named entity segmentation on separate channel.",
        )
        self.segmentChunksReloadLabel = self._addOptionWithReloadLabel(
            segmentationsBox,
            value='segmentChunks',
            label='noun chunks',
            tooltip="Output noun chunk segmentation on separate channel.",
        )
        self.segmentSentencesReloadLabel = self._addOptionWithReloadLabel(
            segmentationsBox,
            value='segmentSentences',
            label='sentences',
            tooltip="Output sentence segmentation on separate channel.",
        )

        self.updateReloadNeededLabels()

        gui.comboBox(
            widget=optionsBox,
            master=self,
            value='maxLen',
            items=(
                ["1 million"]
                + ["%i millions" % num for num in range(2, 10)]
                + ["no limit"]
            ),
            sendSelectedValue=True,
            label=u'Max number of input characters:',
            tooltip=(
                "The spaCy parser and NER models require roughly 1GB of\n"
                "temporary memory per 100'000 characters in the input.\n"
                "This means long texts may cause memory allocation errors.\n"
                "If you're not using the parser or NER, or have lots of \n"
                "RAM, it's probably safe to increase the default limit of\n"
                "1 million characters."
            ),
        )

        gui.rubber(optionsBox)

        OptionsTabBox.addWidget(optionsBox)
        self.optionsTab.setLayout(OptionsTabBox)

        # Model manager tab...
        modelManagerTabBox = QHBoxLayout()

        modelManagerBox = gui.widgetBox(widget=self.modelManagerTab)

        gui.label(modelManagerBox, self, label="Available models:")

        self.downloadableModelsListbox = gui.listBox(
            widget=modelManagerBox,
            master=self,
            value="selectedModels",
            labels="downloadableModelLabels",
            callback=self.downloadableModelsListboxChanged,
            tooltip="Select language models then click Download.",
        )
        self.downloadableModelsListbox.setSelectionMode(3)
        # Assigning the attribute is what populates the listbox.
        self.downloadableModelLabels = DOWNLOADABLE_MODELS[:]

        self.downloadButton = gui.button(
            widget=modelManagerBox,
            master=self,
            label="Download",
            callback=self.downloadModels,
            tooltip="Download the selected language models.",
        )
        self.downloadButton.setDisabled(True)

        modelManagerTabBox.addWidget(modelManagerBox)
        self.modelManagerTab.setLayout(modelManagerTabBox)

        self.controlArea.layout().addWidget(self.tabs)

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input.", "warning")

        # Check that there's a model...
        if not self.model:
            self.infoBox.setText(
                "Please download a language model first.",
                "warning",
            )
            self.tabs.setCurrentIndex(1)
            optionsBox.setDisabled(True)

    def _addOptionWithReloadLabel(self, parentBox, value, label, tooltip):
        """Add a check box followed by a "(reload needed)" label.

        Both controls are placed on a new horizontal line of parentBox. The
        check box is bound to the setting named by value and triggers
        updateDisabledComponents() when toggled.

        Returns the created "(reload needed)" label so that its visibility
        can be managed by updateReloadNeededLabels().
        """
        line = gui.widgetBox(
            widget=parentBox,
            orientation="horizontal",
            box=None,
        )
        gui.checkBox(
            widget=line,
            master=self,
            value=value,
            label=label,
            callback=self.updateDisabledComponents,
            tooltip=tooltip,
        )
        reloadLabel = gui.label(
            line,
            master=self,
            label="(reload needed)",
        )
        reloadLabel.setStyleSheet("font-style: oblique; color: gray")
        return reloadLabel

    def inputData(self, newInput):
        """Process incoming data."""
        self.inputSeg = newInput
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def modelComboboxChanged(self):
        """Respond to model change in UI (Options tab)."""
        # A different model was selected: force a reload on next send.
        self.mustLoad = True
        self.sendButton.settingsChanged()

    def downloadableModelsListboxChanged(self):
        """Respond to model change in UI (Model manager tab)."""
        self.downloadButton.setDisabled(len(self.selectedModels) == 0)

    def downloadModels(self):
        """Respond to Download button (Model manager tab).

        Asks for confirmation, downloads every selected model, removes the
        downloaded models from the list of downloadable ones and finally
        tells the user that Orange must be restarted.
        """

        # Ask for confirmation...
        num_models = len(self.selectedModels)
        message = "You are about to download %i language model@p. " + \
                  "This may take up to several minutes depending on your " + \
                  "internet connection. Do you want to proceed?"
        message = message % num_models
        buttonReply = QMessageBox.question(
            self,
            "Textable",
            pluralize(message, num_models),
            QMessageBox.Ok | QMessageBox.Cancel
        )
        if buttonReply == QMessageBox.Cancel:
            return

        # Download models...
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=num_models)
        try:
            # Highest indices first, so that deleting a label never shifts
            # the position of a label that is still to be processed
            # (whatever the order in which the selection is reported).
            for model_idx in sorted(self.selectedModels, reverse=True):
                model = self.downloadableModelLabels[model_idx]
                download_spacy_model(AVAILABLE_MODELS[model])
                del self.downloadableModelLabels[model_idx]
                progressBar.advance()
        finally:
            # Update GUI (even if a download failed)... The list has been
            # modified in place, so it is re-assigned to itself to have the
            # listbox refreshed.
            self.downloadableModelLabels = self.downloadableModelLabels
            self.selectedModels = list()
            progressBar.finish()
            self.controlArea.setDisabled(False)

        message = "Downloaded %i language model@p, please restart " + \
                  "Orange for changes to take effect."
        message = message % num_models
        QMessageBox.information(
            None,
            "Textable",
            pluralize(message, num_models),
            QMessageBox.Ok
        )

    def updateDisabledComponents(self):
        """Respond to a change of annotation/segmentation options in UI."""
        self.updateReloadNeededLabels()
        self.sendButton.settingsChanged()

    def updateReloadNeededLabels(self):
        """Update the labels that indicate whether model reload is needed.

        A label is shown when its option is ticked while a pipeline
        component required by this option is not currently loaded.
        """
        self.annotatePOSTagsReloadLabel.setVisible(
            self.annotatePOSTags and ("tagger" not in self.loadedComponents)
        )
        self.annotateDependenciesReloadLabel.setVisible(
            self.annotateDependencies and ("parser" not in self.loadedComponents)
        )
        self.annotateEntitiesReloadLabel.setVisible(
            self.annotateEntities and ("ner" not in self.loadedComponents)
        )
        self.segmentSentencesReloadLabel.setVisible(
            self.segmentSentences and "parser" not in self.loadedComponents
        )
        self.segmentChunksReloadLabel.setVisible(
            self.segmentChunks and (
                ("tagger" not in self.loadedComponents)
                or ("parser" not in self.loadedComponents)
            )
        )
        self.segmentEntitiesReloadLabel.setVisible(
            self.segmentEntities and "ner" not in self.loadedComponents
        )

    def getComponentStatus(self):
        """Returns the list of disabled/enabled component based on UI state.

        The result is a (disabledComponents, enabledComponents) tuple; each
        of "tagger", "parser" and "ner" appears in exactly one of the lists.
        """
        disabledComponents = list()
        enabledComponents = list()
        if self.annotatePOSTags or self.segmentChunks:
            enabledComponents.append("tagger")
        else:
            disabledComponents.append("tagger")
        if (
            self.annotateDependencies
            or self.segmentChunks
            or self.segmentSentences
        ):
            enabledComponents.append("parser")
        else:
            disabledComponents.append("parser")
        if self.annotateEntities or self.segmentEntities:
            enabledComponents.append("ner")
        else:
            disabledComponents.append("ner")
        return disabledComponents, enabledComponents

    def loadModel(self):
        """(Re-)load language model if needed.

        Only the components required by the current UI state are loaded.
        Exceptions raised while loading are propagated to the caller, but
        the interface is re-enabled in any case.
        """
        # Initialize progress bar.
        self.infoBox.setText(
            u"Loading language model, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=1)
        try:
            disabled, enabled = self.getComponentStatus()
            self.nlp = spacy.load(
                AVAILABLE_MODELS[self.model],
                disable=disabled,
            )
            self.loadedComponents = enabled
            self.updateReloadNeededLabels()
            self.mustLoad = False
            progressBar.advance()
        finally:
            progressBar.finish()
            self.controlArea.setDisabled(False)

    def sendData(self):
        """Compute result of widget processing and send to output."""

        # Check that there's a model...
        if not self.model:
            self.infoBox.setText(
                "Please download a language model first.",
                "warning",
            )
            self.tabs.setCurrentIndex(1)
            return

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            for channel in [c.name for c in self.outputs]:
                self.send(channel, None, self)
            return

        # Check max length...
        inputLength = sum(len(s.get_content()) for s in self.inputSeg)
        if self.maxLen != "no limit":
            maxNumChar = int(self.maxLen.split()[0]) * 1000000
            if inputLength > maxNumChar:
                self.infoBox.setText(
                    "Input exceeds max number of characters set by user.",
                    "warning",
                )
                for channel in [c.name for c in self.outputs]:
                    self.send(channel, None, self)
                return
        else:
            # No user-defined limit: the value depends on the model's own
            # limit, which is only known once the model is loaded (below).
            maxNumChar = None

        # Load components if needed...
        disabled, enabled = self.getComponentStatus()
        if self.mustLoad or not (
            self.nlp and set(enabled) <= set(self.loadedComponents)
        ):
            self.loadModel()

        # Adjust max length (never lower the model's limit in the
        # "no limit" case, only raise it if the input requires it)...
        if maxNumChar is None:
            maxNumChar = max(inputLength, self.nlp.max_length)
        self.nlp.max_length = maxNumChar

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        tokenSegments = list()
        entitySegments = list()
        chunkSegments = list()
        sentenceSegments = list()

        # Components that are loaded but not needed for the current options
        # are temporarily disabled during processing (this is the same for
        # every input segment).
        loadedComponents = set(self.loadedComponents)
        pipesToDisable = [c for c in disabled if c in loadedComponents]

        try:
            # Process each input segment...
            for segment in self.inputSeg:

                # NLP analysis...
                with self.nlp.disable_pipes(*pipesToDisable):
                    doc = self.nlp(segment.get_content())

                # Get token segments...
                tokenSegments.extend(spacyItemsToSegments(doc, segment))

                # Get named entity segments...
                if self.segmentEntities:
                    entitySegments.extend(
                        spacyItemsToSegments(doc.ents, segment),
                    )

                # Get noun chunk segments...
                if self.segmentChunks:
                    chunkSegments.extend(
                        spacyItemsToSegments(doc.noun_chunks, segment),
                    )

                # Get sentences segments...
                if self.segmentSentences:
                    sentenceSegments.extend(
                        spacyItemsToSegments(doc.sents, segment),
                    )

                progressBar.advance()
        finally:
            # Clear progress bar and re-enable interface, even on failure.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        # Build segmentations and send them to output (channels whose
        # option is not ticked are explicitly cleared, so that they don't
        # keep data computed with previous settings)...
        tokenSeg = Segmentation(tokenSegments, self.captionTitle + "_tokens")
        self.send("Tokenized text", tokenSeg, self)
        if self.segmentChunks:
            chunkSeg = Segmentation(
                chunkSegments,
                self.captionTitle + "_chunks",
            )
            self.send("Noun chunks", chunkSeg, self)
        else:
            self.send("Noun chunks", None, self)
        if self.segmentEntities:
            entitySeg = Segmentation(
                entitySegments,
                self.captionTitle + "_entities",
            )
            self.send("Named entities", entitySeg, self)
        else:
            self.send("Named entities", None, self)
        if self.segmentSentences:
            sentenceSeg = Segmentation(
                sentenceSegments,
                self.captionTitle + "_sentences",
            )
            self.send("Sentences", sentenceSeg, self)
        else:
            self.send("Sentences", None, self)

        # Set status to OK and report data size...
        message = "%i token@p" % len(tokenSeg)
        message = pluralize(message, len(tokenSeg))
        if self.segmentChunks:
            message += ", %i chunk@p" % len(chunkSeg)
            message = pluralize(message, len(chunkSeg))
        if self.segmentEntities:
            message += ", %i " % len(entitySeg)
            message += "entity" if len(entitySeg) == 1 else "entities"
        if self.segmentSentences:
            message += ", %i sentence@p" % len(sentenceSeg)
            message = pluralize(message, len(sentenceSeg))
        message += " sent to output."
        # Replace the last comma (if any) with "and".
        last_comma_idx = message.rfind(",")
        if last_comma_idx > -1:
            message = message[:last_comma_idx] + " and" + \
                message[last_comma_idx+1:]
        self.infoBox.setText(message)

        self.sendButton.resetSettingsChangedFlag()

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class OWTextableTextTree(OWTextableBaseWidget): """Orange widget for loading text folders""" name = "Text Tree" description = "Import data from raw text trees" icon = "icons/Textfolders.png" icon = "icons/textTree.svg" priority = 2 # Input and output channels... inputs = [ ('Message', JSONMessage, "inputMessage", widget.Single) ] outputs = [('Text data', Segmentation)] settingsHandler = VersionedSettingsHandler( version=__version__.rsplit(".", 1)[0] ) # Settings... autoSend = settings.Setting(True) folders = settings.Setting([]) encoding = settings.Setting('iso-8859-1') operation = settings.Setting('nothing') sampling =settings.Setting(100) autoNumber = settings.Setting(False) autoNumberKey = settings.Setting(u'num') importFilenames = settings.Setting(True) importFolderName = settings.Setting(True) importFolderNameKey = settings.Setting(u'folderName') importFileNameKey = settings.Setting(u'filename') FolderDepth1Key = settings.Setting(u'depth 1') FolderDepth2Key = settings.Setting(u'depth 2') FolderDepth2Key = settings.Setting(u'depth 3') FolderDepth2Key = settings.Setting(u'depth 4') FolderDepthLvl = settings.Setting(u'depth level') lastLocation = settings.Setting('.') displayAdvancedSettings = settings.Setting(False) folder = settings.Setting(u'') want_main_area = False def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Other attributes... 
self.segmentation = None self.operation = "no" self.applyInclusion = False self.applyExclusion = False self.applySampling = True self.samplingRate = 100 self.createdInputs = list() self.folderLabels = list() self.selectedfolderLabels = list() self.rootFolderPath = u'' self.inclusionsUser = u'' self.exclusionsUser = u'' self.newAnnotationKey = u'' self.newAnnotationValue = u'' self.folders = list() # self.folders is a list of dictionaries with each dictionaries being a a folder self.inclusionList = [".txt",".html",".xml",".csv"] #by default empty list # self.exclusionList = [".png,",".PNG",".jpg",".JPG",".gif",".GIF",".tiff",".TIFF",".jpeg",".JPEG",".DS_Store"] # by default exclusions : img files, .DS_Store (macOS) self.exclusionList = [] # by default null self.infoBox = InfoBox(widget=self.controlArea) # self.fileList = list() #output file list self.sendButton = SendButton( widget=self.controlArea, master=self, callback=self.sendData, infoBoxAttribute='infoBox', sendIfPreCallback=self.updateGUI, ) self.advancedSettings = AdvancedSettings( widget=self.controlArea, master=self, callback=self.sendButton.settingsChanged, ) # GUI... # Advanced settings checkbox... self.advancedSettings.draw() # BASIC GUI... # Basic folder box basicfolderBox = gui.widgetBox( widget=self.controlArea, box=u'Source', orientation='vertical', addSpace=False, ) basicfolderBoxLine1 = gui.widgetBox( widget=basicfolderBox, box=False, orientation='horizontal', ) gui.lineEdit( widget=basicfolderBoxLine1, master=self, value='rootFolderPath', orientation='horizontal', label=u'Folder path:', labelWidth=101, callback=self.add, tooltip=( u"The path of the folder." ), ) gui.separator(widget=basicfolderBoxLine1, width=5) gui.button( widget=basicfolderBoxLine1, master=self, label=u'Browse', callback=self.browse, tooltip=( u"Open a dialog for selecting a top folder." 
), ) #gui.separator(widget=basicfolderBox, width=3) #gui.comboBox( # widget=basicfolderBox, # master=self, # value='encoding', # items=getPredefinedEncodings(), # sendSelectedValue=True, # orientation='horizontal', # label=u'Encoding:', # labelWidth=101, # callback=self.sendButton.settingsChanged, # tooltip=( # u"Select input folder(s) encoding." # ), # ) gui.separator(widget=basicfolderBox, width=3) self.advancedSettings.basicWidgets.append(basicfolderBox) self.advancedSettings.basicWidgetsAppendSeparator() # ADVANCED GUI... # folder box folderBox = gui.widgetBox( widget=self.controlArea, box=u'Sources', orientation='vertical', addSpace=False, ) folderBoxLine1 = gui.widgetBox( widget=folderBox, box=False, orientation='horizontal', addSpace=True, ) self.folderListbox = gui.listBox( widget=folderBoxLine1, master=self, value='selectedfolderLabels', labels='folderLabels', callback=self.updatefolderBoxButtons, tooltip=( u"The list of folders whose content will be imported.\n" u"\nIn the output segmentation, the content of each\n" u"folder appears in the same position as in the list.\n" u"\nColumn 1 shows the folder's name.\n" u"Column 2 shows the folder's annotation (if any).\n" u"Column 3 shows the folder's encoding." ), ) font = QFont() font.setFamily('Courier') font.setStyleHint(QFont.Courier) font.setPixelSize(12) self.folderListbox.setFont(font) folderBoxCol2 = gui.widgetBox( widget=folderBoxLine1, orientation='vertical', ) self.moveUpButton = gui.button( widget=folderBoxCol2, master=self, label=u'Move Up', callback=self.moveUp, tooltip=( u"Move the selected folder upward in the list." ), ) self.moveDownButton = gui.button( widget=folderBoxCol2, master=self, label=u'Move Down', callback=self.moveDown, tooltip=( u"Move the selected folder downward in the list." ), ) self.removeButton = gui.button( widget=folderBoxCol2, master=self, label=u'Remove', callback=self.remove, tooltip=( u"Remove the selected folder from the list." 
), ) self.clearAllButton = gui.button( widget=folderBoxCol2, master=self, label=u'Clear All', callback=self.clearAll, tooltip=( u"Remove all folders from the list." ), ) self.exportButton = gui.button( widget=folderBoxCol2, master=self, label=u'Export List', callback=self.exportList, tooltip=( u"Open a dialog for selecting a folder where the folder\n" u"list can be exported in JSON format." ), ) self.importButton = gui.button( widget=folderBoxCol2, master=self, label=u'Import List', callback=self.importList, tooltip=( u"Open a dialog for selecting a folder list to\n" u"import (in JSON format). folders from this list\n" u"will be added to those already imported." ), ) folderBoxLine2 = gui.widgetBox( widget=folderBox, box=False, orientation='vertical', ) # Add folder box addfolderBox = gui.widgetBox( widget=folderBoxLine2, box=True, orientation='vertical', ) addfolderBoxLine1 = gui.widgetBox( widget=addfolderBox, orientation='horizontal', ) # Folder path input gui.lineEdit( widget=addfolderBoxLine1, master=self, value='rootFolderPath', orientation='horizontal', label=u'Folder path:', labelWidth=101, callback=self.updateGUI, tooltip=( u"The paths of the folders that will be added to the\n" u"list when button 'Add' is clicked.\n\n" u"Successive paths must be separated with ' / ' \n" u"(whitespace + slash + whitespace). Their order in\n" u"the list will be the same as in this field." ), ) gui.separator(widget=addfolderBoxLine1, width=5) # Button Browse gui.button( widget=addfolderBoxLine1, master=self, label=u'Browse', callback=self.browse, tooltip=( u"Open a dialog for selecting a top folder.\n\n" u"Selected folder paths will appear in the field to\n" u"the left of this button afterwards, ready to be\n" u"added to the list when button 'Add' is clicked." 
), ) gui.separator(widget=addfolderBox, width=10) # Filter choice to include only certain files or to exclude files # ------------ # self.applyInclusion = False à mettre dans le init # gui.checkbox() # callback = lambda t=self.applyInclusion : includeLineEdit.setDisabled(not t) # includeLineEdit = gui.lineEdit() # ------------ # Filter box to input include only gui.separator(widget=addfolderBox, width=3) includeBoxLine1 = gui.widgetBox( widget=addfolderBox, box=False, orientation='horizontal', ) # Include only box gui.checkBox( widget=includeBoxLine1, master=self, value='applyInclusion', label=u'Include only', labelWidth=100, callback = lambda: includeLineEdit.setDisabled(not self.applyInclusion), tooltip=( u"Choose the inclusion" ), ) includeLineEdit = gui.lineEdit( widget=includeBoxLine1, master=self, value='inclusionsUser', orientation='horizontal', label=u'', disabled = True, labelWidth=101, tooltip=( u"This field lets you specify a custom filter\n" u"to select the folders to be\n" u"added to the list." ), ) # Filter box to exclude only gui.separator(widget=addfolderBox, width=3) excludeBoxLine1 = gui.widgetBox( widget=addfolderBox, box=False, orientation='horizontal', ) # Exclude only box gui.checkBox( widget=excludeBoxLine1, master=self, value='applyExclusion', label=u'Exclude', labelWidth=100, disabled = False, callback = lambda: includeLineEdit2.setDisabled(not self.applyExclusion), tooltip=( u"Exclude the inclusion" ), ) includeLineEdit2=gui.lineEdit( widget=excludeBoxLine1, master=self, value='exclusionsUser', orientation='horizontal', label=u'', disabled = True, labelWidth=101, tooltip=( u"This field lets you specify a custom filter\n" u"to select the folders to be\n" u"added to the list." 
), ) # Sampling box to input the level of sampling gui.separator(widget=addfolderBox, width=3) samplingBoxLine1 = gui.widgetBox( widget=addfolderBox, box=False, orientation='horizontal', ) # Check box for sampling gui.checkBox( widget=samplingBoxLine1, master=self, value='applySampling', label=u'Sampling', labelWidth=100, disabled = False, callback = lambda: samplingSpin.setDisabled(not self.applySampling), tooltip=( u"Choose the sampling level" ), ) # Box to input the level of samplig, spin minv = 10 and maxv = 100 # self.importFilenamesKeyLineEdit = gui.spin( samplingSpin = gui.spin( widget=samplingBoxLine1, master=self, value='samplingRate', minv = 10, maxv = 100, labelWidth=50, orientation='horizontal', tooltip=( u"sampling level" ), ) gui.separator(widget=addfolderBox, width=3) self.addButton = gui.button( widget=addfolderBox, master=self, label=u'Add', callback=self.add, tooltip=( u"Add the folder(s) currently displayed in the\n" u"'folders' text field to the list.\n\n" u"Each of these folders will be associated with the\n" u"specified encoding and annotation (if any).\n\n" u"Other folders may be selected afterwards and\n" u"assigned a different encoding and annotation." ), ) self.advancedSettings.advancedWidgets.append(folderBox) self.advancedSettings.advancedWidgetsAppendSeparator() # Options box... optionsBox = gui.widgetBox( widget=self.controlArea, box=u'Options', orientation='vertical', addSpace=False, ) optionsBoxLine1 = gui.widgetBox( widget=optionsBox, box=False, orientation='horizontal', ) # gui.checkBox( # widget=optionsBoxLine1, # master=self, # value='importFilenames', # label=u'Import folder names with key:', # labelWidth=180, # callback=self.sendButton.settingsChanged, # tooltip=( # u"Import folder names as annotations." 
# ), # ) # self.importFilenamesKeyLineEdit = gui.lineEdit( # widget=optionsBoxLine1, # master=self, # value='importFilenamesKey', # orientation='horizontal', # callback=self.sendButton.settingsChanged, # tooltip=( # u"Annotation key for importing folder names." # ), # ) gui.separator(widget=optionsBox, width=3) optionsBoxLine2 = gui.widgetBox( widget=optionsBox, box=False, orientation='horizontal', ) gui.checkBox( widget=optionsBoxLine2, master=self, value='autoNumber', label=u'Auto-number with key:', labelWidth=180, callback=self.sendButton.settingsChanged, tooltip=( u"Annotate folders with increasing numeric indices." ), ) self.autoNumberKeyLineEdit = gui.lineEdit( widget=optionsBoxLine2, master=self, value='autoNumberKey', orientation='horizontal', callback=self.sendButton.settingsChanged, tooltip=( u"Annotation key for folder auto-numbering." ), ) gui.separator(widget=optionsBox, width=3) self.advancedSettings.advancedWidgets.append(optionsBox) self.advancedSettings.advancedWidgetsAppendSeparator() gui.rubber(self.controlArea) # Send button... self.sendButton.draw() # Info box... 
self.infoBox.draw() self.adjustSizeWithTimer() QTimer.singleShot(0, self.sendButton.sendIf) def inputMessage(self, message): """Handle JSON message on input connection""" if not message: return self.displayAdvancedSettings = True self.advancedSettings.setVisible(True) self.clearAll() self.infoBox.inputChanged() try: json_data = json.loads(message.content) temp_folders = list() for entry in json_data: path = entry.get('path', '') encoding = entry.get('encoding', '') annotationKey = entry.get('annotation_key', '') annotationValue = entry.get('annotation_value', '') if path == '' or encoding == '': self.infoBox.setText( u"Please verify keys and values of incoming " u"JSON message.", 'error' ) self.send('Text data', None, self) return depth = "0" options = "[i]:{unicorn}" temp_folders.append(( name, path, depth, options, )) self.folders.extend(temp_folders) self.sendButton.settingsChanged() except ValueError: self.infoBox.setText( u"Please make sure that incoming message is valid JSON.", 'error' ) self.send('Text data', None, self) return def sendData(self): """Load folders, create and send segmentation""" # Check that there's something on input... if ( (self.displayAdvancedSettings and not self.folders) or not (self.rootFolderPath or self.displayAdvancedSettings) ): self.infoBox.setText(u'Please select input folder.', 'warning') self.send('Text data', None, self) return # Check that autoNumberKey is not empty (if necessary)... if self.displayAdvancedSettings and self.autoNumber: if self.autoNumberKey: autoNumberKey = self.autoNumberKey else: self.infoBox.setText( u'Please enter an annotation key for auto-numbering.', 'warning' ) self.send('Text data', None, self) return else: autoNumberKey = None # Clear created Inputs... 
self.clearCreatedInputs() fileContents = list() annotations = list() counter = 1 if self.displayAdvancedSettings: myFolders = self.folders else: myFolders = [[self.rootFolderPath]] progressBar = gui.ProgressBar( self, iterations=len(myFolders) ) # Walk through each folder and open each files successively... fileContents = self.fileContents # Annotations... myFolders = self.folders for myFolder in myFolders: myFiles = myFolder['fileList'] for myFile in myFiles: # print(myFile) annotation = dict() if self.importFileNameKey: annotation[self.importFileNameKey] = myFile['fileName'] if self.importFolderNameKey: annotation[self.importFolderNameKey] = myFile['folderName'] if self.FolderDepth1Key: annotation[self.FolderDepth1Key] = myFile['depth1'] if self.FolderDepth2Key: annotation[self.FolderDepth2Key] = myFile['depth2'] if self.FolderDepthLvl: annotation[self.FolderDepthLvl] = myFile['depthLvl'] annotations.append(annotation) # progressBar.advance() # Create an LTTL.Input for each files... if len(fileContents) == 1: label = self.captionTitle else: label = None for index in range(len(fileContents)): myInput = Input(fileContents[index], label) segment = myInput[0] segment.annotations.update(annotations[index]) myInput[0] = segment self.createdInputs.append(myInput) # If there's only one file, the widget's output is the created Input. if len(fileContents) == 1: self.segmentation = self.createdInputs[0] # Otherwise the widget's output is a concatenation... else: self.segmentation = Segmenter.concatenate( segmentations=self.createdInputs, label=self.captionTitle, copy_annotations=True, import_labels_as=None, sort=False, auto_number_as=None, merge_duplicates=False, progress_callback=None, ) message = u'%i segment@p sent to output ' % len(self.segmentation) message = pluralize(message, len(self.segmentation)) numChars = 0 for segment in self.segmentation: segmentLength = len(Segmentation.get_data(segment.str_index)) numChars += segmentLength message += u'(%i character@p).' 
% numChars message = pluralize(message, numChars) self.infoBox.setText(message) progressBar.finish() self.send('Text data', self.segmentation, self) self.sendButton.resetSettingsChangedFlag() def clearCreatedInputs(self): for i in self.createdInputs: Segmentation.set_data(i[0].str_index, None) del self.createdInputs[:] def importList(self): """Display a folderDialog and import folder list""" folderPath = QFileDialog.getOpenFileName( self, u'Import folder List', self.lastLocation, u'Text folders (*)' ) if not folderPath: return self.rootFolderPath = os.path.normpath(folderPath) self.lastLocation = os.path.dirname(folderPath) self.error() try: folderHandle = codecs.open(folderPath, encoding='utf8') folderContent = folderHandle.read() folderHandle.close() except IOError: QMessageBox.warning( None, 'Textable', "Couldn't open folder.", QMessageBox.Ok ) return try: json_data = json.loads(folderContent) temp_folders = list() for entry in json_data: path = entry.get('path', '') encoding = entry.get('encoding', '') annotationKey = entry.get('annotation_key', '') annotationValue = entry.get('annotation_value', '') if path == '' or encoding == '': QMessageBox.warning( None, 'Textable', "Selected JSON folder doesn't have the right keys " "and/or values.", QMessageBox.Ok ) return temp_folders.append(( path, encoding, annotationKey, annotationValue, )) self.folders.extend(temp_folders) if temp_folders: self.sendButton.settingsChanged() except ValueError: QMessageBox.warning( None, 'Textable', "JSON parsing error.", QMessageBox.Ok ) return def exportList(self): """Display a folderDialog and export folder list""" toDump = list() myFolders = self.folders for myFolder in myFolders: toDump.append({ 'path': myFolder[0], 'encoding': myFolder[1], }) if myFolder[2] and myFolder[3]: toDump[-1]['annotation_key'] = myFolder[2] toDump[-1]['annotation_value'] = myFolder[3] folderPath =QFileDialog.getSaveFileName( self, u'Export folder List', self.lastLocation, ) if folderPath: 
self.lastLocation = os.path.dirname(folderPath) outputfolder = codecs.open( folderPath, encoding='utf8', mode='w', errors='xmlcharrefreplace', ) outputfolder.write( normalizeCarriageReturns( json.dumps(toDump, sort_keys=True, indent=4) ) ) outputfolder.close() QMessageBox.information( None, 'Textable', 'folder list correctly exported', QMessageBox.Ok ) def getFileList(self): #print("getFileList") initialRootParentPath, _ = os.path.split(self.rootFolderPath) #initial parent path is selected's folder parent folder fileListExt = list() # list of files matching default extension depthList = list() progressBar = gui.ProgressBar(self, iterations=1) for curr_path, dirnames, filenames in os.walk(self.rootFolderPath): #curr_path is a STRING, the path to the directory. #dirnames is a LIST of the names of subdirectories. #filenames is a LIST of the names of the files in curr_path #symlink non traités curr_rel_path = curr_path[len(initialRootParentPath)+1:] #defines current relative path by similar initial parent path part curr_rel_path_list = os.path.normpath(curr_rel_path).split(os.sep) #splits current relative path by os separator for filename in filenames: file = dict() # file = {"absoluteFilePath","foldername","filename","depth1","depth2","depth3","depth4","depth5","depth lvl"} # prev_non_excl_check = False # curr_non_excl_check = prev_non_excl_check #importing previous state of the "non-exclusion check" (opposite of exclusion check) annotations = curr_rel_path_list[:] # annotations are different subfolders browsed # print(annotations) curr_depth = len(annotations) depthList.append(curr_depth) file['absoluteFilePath'] = os.path.join(curr_path,filename) file['fileName'] = filename file['depthLvl'] = curr_depth file['folderName'] = annotations[0] for i in range(1, curr_depth): file['depth' + str(i)] = annotations[i] for i in range(curr_depth, 5): file['depth' + str(i)] = "0" # apply default file extension filter for extension in self.inclusionList: if 
filename.endswith(extension): fileListExt.append(file) # apply inclusion filter if self.applyInclusion: fileListIncl = [file for file in fileListExt # match in inclusion list if self.match(file['fileName'], self.inclusionsUserAsList)] else: fileListIncl = fileListExt # apply exclusion filter if self.applyExclusion: fileListExcl = [file for file in fileListIncl # no match in exclusion list if not self.match(file['fileName'], self.exclusionsUserAsList)] else: fileListExcl = fileListIncl # output file list self.fileList = fileListExcl if self.fileList: self.maxDepth = max(depthList) self.openFileList() else: self.maxDepth = 0 progressBar.advance() progressBar.finish() # test if file contains one of the patterns in patternList def match(self, file, patternList): for pattern in patternList: if pattern in file: return True return False def openFileList(self): self.fileContents = list() for file in self.fileList: fileContent = "" try: file_path = file['absoluteFilePath'] except TypeError: pass encodings = getPredefinedEncodings() with open(file_path,'rb') as opened_file: fileContent = opened_file.read() charset_dict = chardet.detect(fileContent) detected_encoding = charset_dict['encoding'] # i = 0 # chunks = list() # # for chunk in iter(lambda: opened_file.read(CHUNK_LENGTH), ""): # chunks.append('\n'.join(chunk.splitlines())) # i += CHUNK_LENGTH # if i % (CHUNK_NUM * CHUNK_LENGTH) == 0: # fileContent += "".join(str(chunks) # chunk = list() # # if len(chunks): # fileContent += "".join(str(chunks)) # del chunks try: encodings.remove(detected_encoding) encodings.insert(0,detected_encoding) except ValueError: pass for encoding in encodings: try: self.fileContent = fileContent.decode(encoding) except: pass # fileContent = normalize('NFC', str(fileContent)) # fileContents.append(fileContent) self.fileContents.append(self.fileContent) del self.fileContents[-1] # print(self.fileContents) def browse(self): """Display a QFileDialog and select a folder""" rootFolderPath = 
QFileDialog.getExistingDirectory( #Use QFileDialog.getExistingDirectory self, u'Select Folder(s)', self.lastLocation, ) if not rootFolderPath: return rootFolderPath = os.path.normpath(rootFolderPath) self.rootFolderPath = rootFolderPath self.lastLocation = rootFolderPath if self.displayAdvancedSettings: pass else: self.add() self.updateGUI() def moveUp(self): """Move folder upward in folders listbox""" if self.selectedfolderLabels: index = self.selectedfolderLabels[0] if index > 0: temp = self.folders[index - 1] self.folders[index - 1] = self.folders[index] self.folders[index] = temp self.selectedfolderLabels.listBox.item(index - 1).setSelected(1) self.sendButton.settingsChanged() def moveDown(self): """Move folder downward in folders listbox""" if self.selectedfolderLabels: index = self.selectedfolderLabels[0] if index < len(self.folders) - 1: temp = self.folders[index + 1] self.folders[index + 1] = self.folders[index] self.folders[index] = temp self.selectedfolderLabels.listBox.item(index + 1).setSelected(1) self.sendButton.settingsChanged() def clearAll(self): """Remove all folders from folders attr""" del self.folders[:] del self.selectedfolderLabels[:] self.sendButton.settingsChanged() def remove(self): """Remove folder from folders attr""" if self.selectedfolderLabels: index = self.selectedfolderLabels[0] self.folders.pop(index) del self.selectedfolderLabels[:] self.sendButton.settingsChanged() def add(self): """Add folders to folders attr""" #rootFolderPathList = re.split(r' +/ +', self.rootFolderPath) #self.rootFolderPath = name # identify sequences separated by a "," and suppress the white spaces self.inclusionsUserAsList = [x.strip() for x in self.inclusionsUser.split(",") if x.strip()] self.exclusionsUserAsList = [x.strip() for x in self.exclusionsUser.split(",") if x.strip()] self.getFileList() # display the list of files print("Files: ", list(map(lambda f: f['fileName'], self.fileList))) sampleFileList = self.sampleFileList() # display the list of 
sampled files print("Files after sampling: ", list(map(lambda f: f['fileName'], sampleFileList))) self.folders.append( { 'rootPath' : self.rootFolderPath, 'maxDepth' : self.maxDepth, 'inclusionsUser' : self.inclusionsUser, 'exclusionsUser' : self.exclusionsUser, 'samplingRate' : self.samplingRate, 'fileList' : sampleFileList, } ) # print(self.folders) self.sendButton.settingsChanged() # for folderDict in self.folders: # fileList = folderDict['fileList'] def sampleFileList(self): # Utilisation de la variable fileList # On fait une copie pour eviter de modifier self.fileList avec shuffle plus bas myList = list(self.fileList) # Initialisation d'un parametre qui decidera de l'echantillonage samplePercentage = self.samplingRate / 100.0 # print(samplePercentage) # On melange la liste pour prendre ensuite les "samplePercentage" premiers random.shuffle(myList) # On definit le nombre de fichiers voulus selon le parametre d'echantillonage "samplePercentage", arrondi au superieur nOfFiles = int(math.ceil(len(myList) * samplePercentage)) # On prend les "nOfFiles" premiers fichiers de la liste melangee return myList[:nOfFiles] def updateGUI(self): """Update GUI state""" if self.displayAdvancedSettings: if self.selectedfolderLabels: cachedLabel = self.selectedfolderLabels[0] else: cachedLabel = None del self.folderLabels[:] folderLabels = [] if self.folders: folderRootPathsList = [f['rootPath'] for f in self.folders] maxDepthList = ['%s' % f['maxDepth'] for f in self.folders] inclusionsUserList = [f['inclusionsUser'] for f in self.folders] # print(inclusionsUserList) exclusionsUserList = [f['exclusionsUser'] for f in self.folders] samplingRatesList = ['%s' % f['samplingRate'] for f in self.folders] folderNamesList = [os.path.basename(p) for p in folderRootPathsList] maxFolderNameLen = max([len(n) for n in folderNamesList]) for index in range(len(self.folders)): format = u'%-' + str(maxFolderNameLen + 2) + u's' # folderLabel = format % folderNamesList[index], folderLabel = format 
% folderNamesList[index] # print(inclusionsUserList[index]) folderLabel += "[d]:{"+maxDepthList[index]+"} " folderLabel += "[i]:{"+inclusionsUserList[index]+"} " folderLabel += "[e]:{"+exclusionsUserList[index]+"} " folderLabel += "[s]:{"+samplingRatesList[index]+"%}" folderLabels.append(folderLabel) self.folderLabels = folderLabels if cachedLabel is not None: self.sendButton.sendIfPreCallback = None self.selectedfolderLabels.listBox.item( cachedLabel ).setSelected(1) self.sendButton.sendIfPreCallback = self.updateGUI if self.rootFolderPath: if ( (self.newAnnotationKey and self.newAnnotationValue) or (not self.newAnnotationKey and not self.newAnnotationValue) ): self.addButton.setDisabled(False) else: self.addButton.setDisabled(True) else: self.addButton.setDisabled(True) if self.autoNumber: self.autoNumberKeyLineEdit.setDisabled(False) else: self.autoNumberKeyLineEdit.setDisabled(True) # if self.importFilenames: # self.importFilenamesKeyLineEdit.setDisabled(False) # else: # self.importFilenamesKeyLineEdit.setDisabled(True) self.updatefolderBoxButtons() self.advancedSettings.setVisible(True) else: self.advancedSettings.setVisible(False) def updatefolderBoxButtons(self): """Update state of folder box buttons""" if self.selectedfolderLabels: self.removeButton.setDisabled(False) if self.selectedfolderLabels[0] > 0: self.moveUpButton.setDisabled(False) else: self.moveUpButton.setDisabled(True) if self.selectedfolderLabels[0] < len(self.folders) - 1: self.moveDownButton.setDisabled(False) else: self.moveDownButton.setDisabled(True) else: self.moveUpButton.setDisabled(True) self.moveDownButton.setDisabled(True) self.removeButton.setDisabled(True) if len(self.folders): self.clearAllButton.setDisabled(False) self.exportButton.setDisabled(False) else: self.clearAllButton.setDisabled(True) self.exportButton.setDisabled(True) def setCaption(self, title): if 'captionTitle' in dir(self): changed = title != self.captionTitle super().setCaption(title) if changed: 
self.sendButton.settingsChanged() else: super().setCaption(title) def onDeleteWidget(self): self.clearCreatedInputs()
class MovieScripts(OWTextableBaseWidget):
    """Textable widget for importing movie scripts from the website IMSDB.com
    (https://www.imsdb.com)

    NOTE(review): the methods below actually query
    www.springfieldspringfield.co.uk, not IMSDB -- confirm which source is
    intended.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Movie Scripts"
    description = "Movie Script Importation"
    icon = "icons/Movie_Scripts.png"
    priority = 11

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = []
    outputs = [("Movie Scripts importation", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    # Saved settings
    autoSend = settings.Setting(True)
    myBasket = settings.Setting([])

    def __init__(self):
        """Widget creator."""

        super().__init__()

        # Search-related attributes...
        self.searchResults = None
        self.inputSeg = None
        # Bound to the "Movie title" line edit.
        self.newQuery = ''
        self.nbr_results = 10
        # Results box attributes...
        self.titleLabels = list()
        self.selectedTitles = list()
        # Selection box attributes...
        self.myTitles = list()
        self.mytitleLabels = list()
        # Stores all created inputs in a list.
        self.createdInputs = list()
        # Mapping from movie title to script identifier, filled by
        # get_all_titles() and used by sendData() when no mapping is given.
        self.title_to_href = dict()
        # NOTE: the original code also assigned `self.sendData = ''` here,
        # which replaced the sendData method by a string on the instance (and
        # passed that string as the SendButton callback); it has been removed.

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
        )

        # User interface...
        # Create the working area
        queryBox = gui.widgetBox(
            widget=self.controlArea,
            box="Search movie",
            orientation="vertical",
        )

        # Free-text query, bound to the "newQuery" attribute.
        gui.lineEdit(
            widget=queryBox,
            master=self,
            value='newQuery',
            orientation='horizontal',
            label=u"Movie title: ",
            labelWidth=100,
            tooltip=("Enter a string"),
        )

        # Desired number of results, bound to the "nbr_results" attribute.
        gui.comboBox(
            widget=queryBox,
            master=self,
            value="nbr_results",
            items=[
                "5",
                "10",
                "20",
                "30",
                "40",
                "50",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Number of results: ",
            labelWidth=120,
            tooltip=("Please select the desired search.\n"),
        )

        # Search button: retrieves the list of available titles.
        self.searchButton = gui.button(
            widget=queryBox,
            master=self,
            label='search',
            callback=self.get_all_titles,
            tooltip='Search for the movie',
        )
        self.searchButton.setDisabled(False)

        # List of retrieved titles. The Select button is only enabled while
        # at least one title is selected. (The original callback referenced
        # a non-existent `addButton`.)
        self.titleListbox = gui.listBox(
            widget=queryBox,
            master=self,
            value="selectedTitles",    # setting (list)
            labels="titleLabels",      # setting (list)
            callback=lambda: self.selectButton.setDisabled(
                self.selectedTitles == list()),
            tooltip="The list of titles whose content will be imported",
        )
        self.titleListbox.setMinimumHeight(120)
        self.titleListbox.setSelectionMode(3)

        boxbutton = gui.widgetBox(
            widget=queryBox,
            box=False,
            orientation='horizontal',
        )

        # Select button (no callback attached yet).
        self.selectButton = gui.button(
            widget=boxbutton,
            master=self,
            label="Select",
            # callback=self.clearResults,
            tooltip="Select",
        )
        self.selectButton.setDisabled(True)

        # Clear button (no callback attached yet).
        self.clearButton = gui.button(
            widget=boxbutton,
            master=self,
            label="Clear results",
            # callback=self.clearResults,
            tooltip="Clear results",
        )
        self.clearButton.setDisabled(True)
        gui.separator(widget=queryBox, height=3)

        gui.rubber(self.controlArea)

        #----------------------------------------------------------------------
        # Draw Info box and Send button
        # self.sendButton.draw()
        # self.searchButton.setDefault(True)
        self.infoBox.draw()

        # Send data if autoSend.
        # self.sendButton.sendIf()

    def get_all_titles(self):
        """Get movie titles from www.springfieldspringfield.co.uk.

        Result pages are fetched one after the other until a page contains
        no script link. The title -> script identifier mapping is stored in
        self.title_to_href and the titles are displayed in the results list.
        """
        php_query_string = '/movie_script.php?movie='
        http_query_string = (
            'https://www.springfieldspringfield.co.uk/'
            'movie_scripts.php?order='
        )
        link_class = re.compile("^script-list-item")

        title_to_href = dict()

        # Only the '0' index page is scanned for now.
        # TODO: extend to the alphabetical index pages ('A' to 'Z').
        for lettre in ['0']:
            page_num = 1
            while True:
                page_url = http_query_string + '%s&page=%i' % (
                    lettre,
                    page_num,
                )
                page = urllib.request.urlopen(page_url)
                soup = BeautifulSoup(page, 'html.parser')
                script_links = soup.findAll('a', attrs={'class': link_class})
                if not script_links:
                    break
                for link in script_links:
                    # Keep only the part of the href following the PHP
                    # query prefix.
                    title_to_href[link.text] = (
                        link.get('href')[len(php_query_string):]
                    )
                print(page_num)
                page_num += 1

        # The original code appended the whole dict to the label list, which
        # did not work (as its own comment noted); the list box expects one
        # label per title. Re-assigning the attribute refreshes the list box.
        self.title_to_href = title_to_href
        self.titleLabels = list(title_to_href)

    @staticmethod
    def export_scripts(title_to_href):
        """Export the title mapping to a file whose name is asked on stdin."""
        try:
            name_file = input('Entrez le nom du fichier à exporter: ')
            with open(name_file, 'w', encoding='utf8') as exported_file:
                exported_file.write(str(title_to_href))
        except IOError:
            print('Impossible de lire le fichier')
        return

    def sendData(self, title_to_href=None):
        """Retrieve and print the script of a single movie.

        title_to_href maps movie titles to script identifiers; when omitted
        (e.g. when called by the Send button), the mapping retrieved by the
        last call to get_all_titles() is used. The title is asked on stdin
        and matched approximately against the known titles.
        """
        if title_to_href is None:
            title_to_href = self.title_to_href
        if not title_to_href:
            self.infoBox.setText("Please search for a movie first.", "warning")
            return

        movie_names_row = input(
            '\033[31m Entrez le nom du film et l\'année entre parenthèses, ex : 99 Homes (2014) : \033[0m'
        )

        # First argument is the user's input, second the collection of known
        # titles; only the best match scoring at least 70 is kept.
        movie_names = process.extractBests(
            movie_names_row,
            title_to_href.keys(),
            limit=1,
            score_cutoff=70,
        )
        titles = [movie_name[0] for movie_name in movie_names]

        if not titles:
            # No title is close enough to the query (the original code
            # raised IndexError here).
            print('Aucun résultat')
            self.searchButton.setDisabled(False)
            return

        title = titles[0]
        print(title)

        if input('\033[31m Entrez "yes" pour continuer : \033[0m') == 'yes':
            if title in title_to_href:
                print(title_to_href[title])
                page_url = (
                    "https://www.springfieldspringfield.co.uk/movie_script.php?movie="
                    + title_to_href[title]
                )
                page = urllib.request.urlopen(page_url)
                soup = BeautifulSoup(page, 'html.parser')
                script = soup.find("div", {"class": "movie_script"})
                print(script.text)
            else:
                print('Aucun résultat')

        self.searchButton.setDisabled(False)

    # def searchFunction(self):
    #     result_list = {"1": "a", "1": "a", "1": "a", "1": "a",}
    #     query_string = self.newQuery
    #     if query_string != "":
    #         page = 1
    #         page_max = int(self.nbr_results)/10
    #         result_id = 0
    #         result_artist = []
    #         self.controlArea.setDisabled(True)
    #         # Initialize progress bar.
    #         progressBar = ProgressBar(
    #             self,
    #             iterations=page_max
    #         )
    #         # Each result is stored in a dictionnary with its title,
    #         # artist's name, artist's ID and URL path
    #         for result in body:
    #             result_id += 1
    #             title = result["result"]["title"]
    #             artist = result["result"]["primary_artist"]["name"]
    #             artist_id = result["result"]["primary_artist"]["id"]
    #             path = result["result"]["path"]
    #             result_list[result_id] = {'artist': artist,
    #                                       'artist_id':artist_id,
    #                                       'path':path, 'title':title}
    #         page += 1
    #         # 1 tick on the progress bar of the widget
    #         progressBar.advance()
    #         # Stored the results list in the "result_list" variable
    #         self.searchResults = result_list
    #         # Reset and clear the visible widget list
    #         del self.titleLabels[:]
    #         # Update the results list with the search results
    #         # in order to display them
    #         for idx in self.searchResults:
    #             result_string = self.searchResults[idx]["title"] + " - " + \
    #                             self.searchResults[idx]["artist"]
    #             self.titleLabels.append(result_string)
    #         self.titleLabels = self.titleLabels
    #         self.clearButton.setDisabled(False)
    #         self.addButton.setDisabled(self.selectedTitles == list())
    #         # Clear progress bar.
    #         progressBar.finish()
    #         self.controlArea.setDisabled(False)
    #     else:
    #         self.infoBox.setText("You didn't search anything", "warning")

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...
    def setCaption(self, title):
        """Set the widget caption and flag settings as changed if it differs."""
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                # The attribute is `sendButton` (the original `SendButton`
                # raised AttributeError).
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class Charnet(OWTextableBaseWidget):
    """Textable widget for building character networks with Charnet."""

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Charnet"
    description = "Build character networks with the Charnet package"
    icon = "icons/charnet.svg"
    priority = 21   # TODO

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Text data", Segmentation, "inputData")]
    outputs = [("Character segmentation", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    # TODO

    #----------------------------------------------------------------------
    # The following lines need to be copied verbatim in every Textable widget...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0]
    )

    def __init__(self):
        """Widget creator."""

        super().__init__()

        #----------------------------------------------------------------------
        # Other (non settings) attributes...

        self.inputSeg = None
        self.selectedCharacters = list()
        self.characters = list()
        self.mustLoad = True
        # Use the first installed model, if any.
        self.model = INSTALLED_MODELS[0] if INSTALLED_MODELS else ""

        #----------------------------------------------------------------------
        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...

        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )

        #----------------------------------------------------------------------
        # User interface...

        self.characterListbox = gui.listBox(
            widget=self.controlArea,
            master=self,
            value="selectedCharacters",
            labels="characters",
            callback=None,
            tooltip="List of identified characters",
        )

        gui.rubber(self.controlArea)

        #----------------------------------------------------------------------
        # Draw Info box and Send button...

        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input.", "warning")

        # Check that there's a model...
        if not self.model:
            self.noLanguageModelWarning()

    def inputData(self, newInput):
        """Store incoming data, refresh the character list and send if needed."""
        self.inputSeg = newInput
        if self.inputSeg is not None:
            self.updateCharacterList()
            self.infoBox.inputChanged()
            self.sendButton.sendIf()
            return
        # No input: warn, clear the outputs and empty the character list.
        self.infoBox.setText("Widget needs input.", "warning")
        self.sendNoneToOutputs()
        self.characters = list()

    def updateCharacterList(self):
        """Run Charnet on the input and update the displayed characters."""
        if self.mustLoad:
            self.loadModel()
        self.controlArea.setDisabled(True)
        progress = ProgressBar(self, iterations=4)

        # Step 1: concatenate the content of all input segments.
        contents = [segment.get_content() for segment in self.inputSeg]
        text = " ".join(contents)
        progress.advance()

        # Step 2: extract the entities with spaCy.
        self.char_df = charnet.extract_spacy_df(text, self.nlp)  # TODO progress
        progress.advance()

        # Step 3: unify tags.
        self.char_df = charnet.unify_tags(self.char_df)
        progress.advance()

        # Step 4: group names and build one label per group.
        self.char_list = charnet.concatenate_parents(self.char_df, min_occ=1)
        self.characters = [", ".join(names) for names in self.char_list]
        progress.advance()

        progress.finish()
        self.controlArea.setDisabled(False)

    def loadModel(self):
        """(Re-)load language model if needed."""
        self.infoBox.setText(
            u"Loading language model, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progress = ProgressBar(self, iterations=1)
        # The model name is currently hard-coded.
        # (AVAILABLE_MODELS[self.model] is not used yet.)
        self.nlp = spacy.load("en_core_web_sm")
        self.mustLoad = False
        progress.advance()
        progress.finish()
        self.controlArea.setDisabled(False)

    def noLanguageModelWarning(self):
        """Warn user that a spaCy model must be installed and disable GUI."""
        self.infoBox.setText(
            "Please use the spaCy widget to download a language "
            "model first.",
            "warning",
        )
        self.controlArea.setDisabled(True)

    def sendNoneToOutputs(self):
        """Send None token to all output channels."""
        channel_names = [output.name for output in self.outputs]
        for channel_name in channel_names:
            self.send(channel_name, None, self)
        return

    def sendData(self):
        """Compute result of widget processing and send to output."""

        # Check that there's a model...
        if not self.model:
            self.noLanguageModelWarning()
            self.sendNoneToOutputs()
            return

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.sendNoneToOutputs()
            return

        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )

        # Disable control area and initialize progress bar...
        self.controlArea.setDisabled(True)
        progress = ProgressBar(self, iterations=len(self.char_df))

        # Get start and end pos of concatenated input segments: contents were
        # joined with one separating character, hence the "+ 1" between
        # consecutive segments.
        seg_count = len(self.inputSeg)
        seg_starts = [0]
        seg_ends = list()
        for seg_idx in range(1, seg_count):
            previous_length = len(self.inputSeg[seg_idx - 1].get_content())
            next_start = seg_starts[-1] + previous_length + 1
            seg_starts.append(next_start)
            seg_ends.append(next_start - 1)
        last_length = len(self.inputSeg[-1].get_content())
        seg_ends.append(seg_starts[-1] + last_length + 1)

        # For each character token in Charnet's output...
        found_segments = list()
        seg_cursor = 0
        for _, token in self.char_df.iterrows():

            # Get index of containing segment...
            while token["end_pos"] > seg_ends[seg_cursor]:
                seg_cursor += 1

            # Create segment for char with its actual coordinates (relative
            # to the start of the containing input segment)...
            offset = seg_starts[seg_cursor]
            found_segments.append(
                Segment(
                    self.inputSeg[seg_cursor].str_index,
                    token["start_pos"] - offset,
                    token["end_pos"] - offset,
                )
            )
            progress.advance()

        # Send output...
        result = Segmentation(found_segments, label=self.captionTitle)
        self.send("Character segmentation", result, self)
        print(result.to_string())

        # Set status to OK and report data size...
        num_sent = len(result)
        self.infoBox.setText(
            pluralize("%i segment@p sent to output." % num_sent, num_sent)
        )

        # Clear progress bar.
        progress.finish()
        self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()

    #----------------------------------------------------------------------
    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        """Set the widget caption and flag settings as changed if it differs."""
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class Treetagger(OWTextableBaseWidget):
    """Textable widget for tagging and lemmatizing text with TreeTagger.

    The incoming segmentation is serialized to text, tokenized with
    TreeTagger's Perl tokenizer, tagged with the ``tree-tagger`` binary and
    converted back into a segmentation whose ``<w>`` elements carry ``lemma``
    and ``type`` attributes.

    The location of the TreeTagger base folder is read from (and saved to) a
    small text file, see ``_linkFilePath``.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Treetagger"
    description = "..."
    icon = "icons/icon_treetagger.png"
    priority = 1

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Text Input", Segmentation, "processInputData")]
    outputs = [("Text data", Segmentation)]

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    autoSend = settings.Setting(False)
    unknown = settings.Setting(False)
    activer_xml = settings.Setting(False)

    want_main_area = False

    def __init__(self):
        """Widget creator."""

        super().__init__()

        # Other (non-setting) attributes...
        self.inputData = None
        self.system = os.name
        self.user = os.environ.get("USER")
        self.langues = list()
        self.created_inputs = list()
        self.language = 0
        self.check_firt_use = False
        self.createdInputs = list()
        self.compteur = 0
        self.NoLink = True

        # Supported languages: the first file is the parameter file, the
        # second one (when present) the abbreviation list.
        self.langues_possibles = {
            "French": ["french.par", "french-abbreviations"],
            "English": ["english-utf8.par", "english-abbreviations"],
            "German": ["german-utf8.par", "german-abbreviations"],
            "Italian": ["italian-utf8.par", "italian-abbreviations"],
            "Swahili": ["swahili.par", "swahili-abbreviations"],
            "Portuguese": ["portuguese.par", "portuguese-abbreviations"],
            "Russian": ["russian.par", "russian-abbreviations"],
            "Spanish": ["spanish-utf8.par", "spanish-abbreviations",
                        "spanish-mwls"],
            "Slovenian": ["slovenian-utf8.par"],
            "Slovak": ["slovak2-utf8.par"],
            "Romanian": ["romanian.par"],
            "Polish": ["polish-utf8.par"],
            "Mongolian": ["mongolian.par"],
            "Latin": ["latin.par"],
            "Galician": ["galician.par"],
            "Finnish": ["finnish-utf8.par"],
            "Estonian": ["estonian.par"],
            "Bulgarian": ["bulgarian-utf8.par"],
            "Spoken French": ["spoken-french.par", "french-abbreviations"]
        }

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute=u"infoBox",
            sendIfPreCallback=self.updateGUI,
        )

        # User interface...

        # Option box...
        gui.separator(widget=self.controlArea, height=5)

        self.infoBox1 = gui.widgetBox(
            self.controlArea,
            u"Option",
            addSpace=True,
        )

        # Language selection...
        self.langueBox = gui.comboBox(
            widget=self.infoBox1,
            master=self,
            value="language",
            items=self.langues,
            orientation=u"horizontal",
            label="Select text language :",
            callback=self.settings_changed,
        )
        self.langueBox.setMaximumWidth(100)

        gui.separator(widget=self.controlArea, height=3)

        # Checkbox enabling XML output...
        self.choix_xml = gui.checkBox(
            widget=self.infoBox1,
            master=self,
            value="activer_xml",
            label=" Output with XML code",
            callback=self.settings_changed,
        )

        # Checkbox controlling the treatment of unknown words...
        self.choix_unknown = gui.checkBox(
            widget=self.infoBox1,
            master=self,
            value="unknown",
            label=" Output without '[unknown]'",
            callback=self.settings_changed,
        )

        # Button for locating the TreeTagger base folder...
        self.treetagger_box = gui.widgetBox(
            self.controlArea,
            u"Please, enter a correct path to TreeTagger :",
            addSpace=True,
        )

        gui.button(
            widget=self.treetagger_box,
            master=self,
            label="Browse",
            callback=self.treetagger_search,
        )

        gui.separator(widget=self.treetagger_box, height=3)

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()

        # Send data if autoSend.
        self.sendButton.sendIf()

        # Adjust widget size.
        self.adjustSizeWithTimer()

        # Check the stored TreeTagger location.
        self.treetagger_check()

    def _linkFilePath(self):
        """Return the path of the file storing TreeTagger's location.

        On Windows this is "treetagger_link.txt" in the current directory,
        elsewhere "/Users/<USER>/treetagger_link.txt". When the USER
        environment variable is not set, the file is looked up in the home
        directory instead of failing on the string concatenation.
        """
        if self.system == "nt":
            return "treetagger_link.txt"
        if self.user:
            return os.path.normpath(
                "/Users/" + self.user + "/treetagger_link.txt"
            )
        return os.path.normpath(os.path.expanduser("~/treetagger_link.txt"))

    def _saveLink(self):
        """Write the current TreeTagger location to the link file."""
        with open(self._linkFilePath(), "w") as link_file:
            link_file.write(self.treetagger_link)

    def treetagger_check(self):
        """Check the TreeTagger location and update the interface.

        The location is read from the link file if it exists, otherwise a
        platform-specific default is used. The interface is unlocked only if
        the tokenizer scripts and the tagger binary are all found.

        Returns:
            list of bool: for each required file, whether it exists.
        """

        # Files that the TreeTagger base folder must contain...
        tokenize = os.path.normpath("/cmd/tokenize.pl")
        tokenize_utf8 = os.path.normpath("/cmd/utf8-tokenize.perl")
        treetagger = os.path.normpath("/bin/tree-tagger")
        if self.system == "nt":
            treetagger += ".exe"
        check_list = [tokenize, tokenize_utf8, treetagger]

        # Get stored location, or fall back to the default one...
        link_file_path = self._linkFilePath()
        if os.path.exists(link_file_path):
            with open(link_file_path, "r") as link_file:
                self.treetagger_link = link_file.read()
        elif self.system == "nt":
            # Raw string: "\T" is not a valid escape sequence.
            self.treetagger_link = os.path.normpath(r"C:\TreeTagger")
        else:
            self.treetagger_link = os.path.normpath(
                "/Applications/TreeTagger"
            )

        # Check that every required file exists...
        liste = [
            os.path.exists(self.treetagger_link + check)
            for check in check_list
        ]

        if not all(liste):
            # Keep the "Browse" button visible and lock the options...
            self.NoLink = True
            self.treetagger_box.setVisible(True)
            self.infoBox1.setDisabled(True)

            if self.check_firt_use is False:
                self.infoBox.setText(
                    u"Please click 'Browse' and select the path "
                    u"to TreeTagger base folder. ",
                    "warning",
                )
            else:
                self.infoBox.setText(
                    u"Sorry, TreeTagger's link isn't correct.",
                    "error",
                )
        else:
            if self.check_firt_use is True:
                self.infoBox.setText(
                    u"TreeTagger's link is correct !\n\n"
                    u"Now, Widget needs input.",
                    "warning",
                )
            else:
                self.infoBox.setText(u"Widget needs input.", "warning")

            # Display installed languages...
            self.language_possibility()
            for langue_actualise in self.langues:
                self.langueBox.addItem(langue_actualise)

            # Hide the "Browse" button and unlock the options...
            self.NoLink = False
            self.treetagger_box.setVisible(False)
            self.infoBox1.setDisabled(False)

        self.saveSettings()

        return liste

    def treetagger_search(self):
        """Let the user pick the TreeTagger base folder and check it."""

        chosen_dir = str(
            QFileDialog.getExistingDirectory(
                self,
                u"Enter a path to Treetagger"
            )
        )

        # Dialog cancelled: keep the current state instead of storing ".".
        if not chosen_dir:
            return

        self.treetagger_link = os.path.normpath(chosen_dir)

        # Save the location for future reference...
        self._saveLink()

        self.check_firt_use = True

        # Check that the selected location works.
        self.treetagger_check()

    def language_possibility(self):
        """Find which supported languages are installed in TreeTagger.

        A language is considered installed if all its files are present in
        the "lib" folder of the TreeTagger base folder. The result is also
        stored in self.langues.

        Returns:
            list of str: names of the installed languages.
        """
        lib_dir = self.treetagger_link + "/lib/"
        langues_presentes = [
            langue
            for langue, fichiers in self.langues_possibles.items()
            if all(
                os.path.isfile(os.path.normpath(lib_dir + fichier))
                for fichier in fichiers
            )
        ]
        self.langues = langues_presentes
        return langues_presentes

    def processInputData(self, inputData):
        """Process incoming data."""
        self.inputData = inputData

        # Update the info box once the widget has already sent something...
        if self.compteur != 0:
            self.infoBox.inputChanged()

        # Send data to output.
        self.sendButton.sendIf()

    def sendData(self):
        """Tag the input with TreeTagger and send the result to output.

        None is sent if TreeTagger could not be located or if there is no
        input.
        """

        # TreeTagger could not be located...
        if self.NoLink:
            self.infoBox.setText(
                u"Sorry, TreeTagger's link not found.",
                "error"
            )
            self.send("Text data", None)
            return

        # Important: if input data is None, propagate this value to output...
        if not self.inputData:
            self.infoBox.setText(u"Widget needs input", "warning")
            self.send("Text data", None)
            return

        # Tell the user that something is going on...
        self.infoBox.setText(u"TreeTagger is running...", "warning")

        # Initialize progress bar.
        self.progressBar = gui.ProgressBar(self, iterations=5)

        try:
            # Copy the segmentation, storing each segment's annotations in an
            # extra annotation so that they survive the round trip...
            copy_of_input_seg = Segmentation()
            copy_of_input_seg.label = self.inputData.label
            for segment in self.inputData:
                attr = " ".join(
                    ["%s='%s'" % item for item in segment.annotations.items()]
                )
                segment.annotations["tt_xb"] = attr
                copy_of_input_seg.append(segment)

            self.progressBar.advance()

            concatenated_text = copy_of_input_seg.to_string(
                formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
                display_all=True,
            )

            self.progressBar.advance()

            tagged_text = self.tag(concatenated_text)
            tagged_input = Input(tagged_text)
            tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")

            self.progressBar.advance()

            # Convert TreeTagger's tab-separated output to XML (the same
            # recoding is needed whether XML output is requested or not)...
            xml_segmentation, _ = Segmenter.recode(
                tagged_segmentation,
                substitutions=[
                    (re.compile(r"<unknown>"), "[unknown]"),
                    (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                     "<w lemma='&3' type='&2'>&1</w>"),
                    (re.compile(r'"""'), '"""'),
                ],
            )

            if self.activer_xml:
                final_segmentation = xml_segmentation
            else:
                final_segmentation = Segmenter.import_xml(
                    xml_segmentation, "w"
                )

            self.infoBox.dataSent("")

            # Save TreeTagger's location...
            self._saveLink()
        finally:
            # Clear progress bar, even if tagging failed.
            self.progressBar.finish()

        # Send the segmentation.
        self.send("Text data", final_segmentation, self)
        self.compteur += 1
        self.sendButton.resetSettingsChangedFlag()

    def tag(self, inputData):
        """Tokenize and tag a string with TreeTagger.

        Args:
            inputData: the text to process.

        Returns:
            str: the decoded output of the tree-tagger command.
        """

        # Temporary files...
        tmp = os.path.normpath(os.path.expanduser("~/tmp_file.txt"))
        tmp2 = os.path.normpath(os.path.expanduser("~/tmp_file2.txt"))

        langue = self.langues[self.language]
        fichiers = self.langues_possibles[langue]
        use_shell = self.system == "nt"

        try:
            # Write the text to the first file.
            with open(tmp, "w", encoding="utf-8") as f:
                f.write(inputData)

            # Languages with a dedicated tokenizer option...
            option = {
                "French": "-f",
                "English": "-e",
                "Italian": "-i",
            }.get(langue, "")

            # Perl command splitting the text into words...
            if option:
                commande1 = [
                    "perl",
                    os.path.normpath(
                        self.treetagger_link + "/cmd/utf8-tokenize.perl"
                    ),
                    option,
                ]
            else:
                commande1 = [
                    "perl",
                    os.path.normpath(
                        self.treetagger_link + "/cmd/tokenize.pl"
                    ),
                ]

            # Several languages have no abbreviation file: only pass "-a"
            # when there is one.
            if len(fichiers) > 1:
                commande1 += [
                    "-a",
                    os.path.normpath(
                        self.treetagger_link + "/lib/" + fichiers[1]
                    ),
                ]
            commande1.append(tmp)

            # Run the tokenizer...
            outcom1 = sp.Popen(commande1, stdout=sp.PIPE, shell=use_shell)
            out = outcom1.communicate()[0].decode(
                encoding="utf-8", errors="ignore"
            )
            if self.system == "nt":
                out = out.replace("\r", "")

            self.progressBar.advance()

            # Write the tokenized text to the second file.
            with codecs.open(tmp2, "w", encoding="utf-8") as f:
                f.write(out)

            if self.system == "nt":
                bin_treetagger = "/bin/tree-tagger.exe"
            else:
                bin_treetagger = "/bin/tree-tagger"

            # Command tagging the text with type and lemma...
            commande2 = [
                os.path.normpath(self.treetagger_link + bin_treetagger),
                os.path.normpath(
                    self.treetagger_link + "/lib/" + fichiers[0]
                ),
                "-token",
                "-lemma",
                "-sgml",
            ]
            if self.unknown:
                commande2.append("-no-unknown")
            commande2 += ["-quiet", tmp2]

            output = sp.Popen(commande2, stdout=sp.PIPE, shell=use_shell)
            outtext = output.communicate()[0].decode(
                encoding="utf-8", errors="ignore"
            )
        finally:
            # Delete temporary files, even if one of the commands failed.
            for tmp_path in (tmp, tmp2):
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)

        self.progressBar.advance()

        return outtext

    def updateGUI(self):
        """Update GUI state"""
        pass

    def clearCreatedInputs(self):
        """Delete all Input objects that have been created."""
        for i in self.createdInputs:
            Segmentation.set_data(i[0].str_index, None)
        del self.createdInputs[:]

    def settings_changed(self):
        """Signal a settings change once the widget has sent an output."""
        # Nothing is signalled as long as no output has been sent yet...
        if self.compteur > 0:
            return self.sendButton.settingsChanged()

    def onDeleteWidget(self):
        """Free memory when widget is deleted (overriden method)"""
        self.clearCreatedInputs()

    #----------------------------------------------------------------------
    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class Charnetto(OWTextableBaseWidget):
    """Textable widget for building character networks with Charnetto.

    Named entities are extracted from the input with a spaCy model, the
    resulting character list can be edited by the user, and every "PER"
    entity is sent to output as a segment annotated with its character id.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Charnetto"
    description = "Build character networks with the Charnetto package"
    icon = "icons/charnetto.svg"
    priority = 20

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Text data", Segmentation, "inputData")]
    outputs = [("Character segmentation", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    sourceType = settings.Setting("Plain text")
    minFreq = settings.Setting(1)
    model = settings.Setting("")

    #----------------------------------------------------------------------
    # The following lines need to be copied verbatim in every Textable widget...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0]
    )

    def __init__(self):
        """Widget creator."""

        super().__init__()

        #----------------------------------------------------------------------
        # Other (non settings) attributes...

        self.inputSeg = None
        self.selectedCharacters = list()
        self.characters = list()
        # Charnetto's output and the unedited character list; both are
        # (re)computed by updateCharacterList().
        self.char_df = None
        self.cachedCaracters = list()
        if spacy_widget.INSTALLED_MODELS:
            # Keep the saved model if it is still installed.
            if self.model not in spacy_widget.INSTALLED_MODELS:
                self.model = spacy_widget.INSTALLED_MODELS[0]
            self.mustInstall = False
        else:
            self.model = ""
            self.mustInstall = True
        self.editsWereMade = False

        #----------------------------------------------------------------------
        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...

        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )

        #----------------------------------------------------------------------
        # User interface...

        # Options box...
        self.optionsBox = gui.widgetBox(
            widget=self.controlArea,
            box="Options",
            orientation="vertical",
        )

        # TODO: source type combo ("Plain text" vs. "IMSDB-formatted script",
        # bound to "sourceType" with changeSourceType as callback).

        self.spacyModelCombo = gui.comboBox(
            widget=self.optionsBox,
            master=self,
            value="model",
            sendSelectedValue=True,
            items=spacy_widget.INSTALLED_MODELS,
            orientation="horizontal",
            label="SpaCy model:",
            labelWidth=120,
            callback=self.loadModel,
            tooltip=("Choose spaCy model for named entity recognition."),
        )

        # TODO: minimum frequency spin box (bound to "minFreq").

        # Character box...
        self.characterBox = gui.widgetBox(
            widget=self.controlArea,
            box="Edit character list",
            orientation="vertical",
        )

        characterListbox = gui.listBox(
            widget=self.characterBox,
            master=self,
            value="selectedCharacters",
            labels="characters",
            callback=self.updateButtons,
            tooltip="List of identified characters",
        )
        # TODO set min height

        self.characterButtonBox = gui.widgetBox(
            widget=self.characterBox,
            orientation="horizontal",
        )

        self.newButton = gui.button(
            widget=self.characterButtonBox,
            master=self,
            label="New",
            callback=self.newCharacter,
            tooltip="Add a new entry to the character list.",
        )

        self.editButton = gui.button(
            widget=self.characterButtonBox,
            master=self,
            label="Edit",
            callback=self.editCharacters,
            tooltip="Edit the selected character list entry.",
        )

        self.deleteButton = gui.button(
            widget=self.characterButtonBox,
            master=self,
            label="Delete",
            callback=self.deleteCharacter,
            tooltip="Delete the selected character list entry.",
        )

        self.resetButton = gui.button(
            widget=self.characterButtonBox,
            master=self,
            label="Reset",
            callback=self.resetCharacters,
            tooltip="Revert all changes made to character list.",
        )

        self.updateButtons()

        gui.rubber(self.controlArea)

        #----------------------------------------------------------------------
        # Draw Info box and Send button...

        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input.", "warning")

        # Check that there's a model...
        if self.mustInstall:
            self.noLanguageModelWarning()
        else:
            self.loadModel()

    def inputData(self, newInput):
        """Process incoming data."""
        if self.mustInstall:
            return
        self.inputSeg = newInput
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.sendNoneToOutputs()
            self.characters = list()
            return
        self.updateCharacterList()
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def updateCharacterList(self):
        """Update character list based on Charnetto output.

        Any manual edit of the list is discarded, since the list is rebuilt
        from scratch.
        """
        # Sanity checks...
        if not self.model or not self.inputSeg:
            return

        # Init UI...
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=4)

        try:
            # Get input strings...
            strings = [segment.get_content() for segment in self.inputSeg]
            progressBar.advance()

            # Extract character tokens...
            self.char_df = charnetto.extract_spacy_df(strings, self.nlp)
            # TODO deal with \n in names
            progressBar.advance()

            # Unify spaCy tags to match those of flair...
            self.char_df = charnetto.unify_tags(self.char_df)
            progressBar.advance()

            # Collapse characters whose name is the prefix of another.
            self.char_list = charnetto.concatenate_parents(
                self.char_df, min_occ = 1
            )

            # Build char list.
            self.characters = [", ".join(char) for char in self.char_list]
            progressBar.advance()
        finally:
            # Reset UI, even if extraction failed.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        # Cache character list for resetting if needed.
        self.cachedCaracters = self.characters[:]

        # The list has just been rebuilt: there are no pending edits.
        self.editsWereMade = False
        self.updateButtons()

    def loadModel(self):
        """(Re-)load language model if needed."""
        # Display warning, disable UI and initialize progress bar...
        self.infoBox.setText(
            u"Loading language model, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=1)

        try:
            # Load model.
            self.nlp = spacy.load(spacy_widget.AVAILABLE_MODELS[self.model])
            progressBar.advance()
        finally:
            # Reset UI, even if loading failed.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        # Update char list if there's an input...
        if self.inputSeg:
            self.updateCharacterList()

        self.sendButton.settingsChanged()

    def noLanguageModelWarning(self):
        """Warn user that a spaCy model must be installed and disable GUI."""
        self.infoBox.setText(
            "Please first use the spaCy widget to download a language "
            "model, then create a new copy of the Charnetto widget.",
            "warning",
        )
        self.controlArea.setDisabled(True)

    def changeSourceType(self):
        """Deal with user-requested source type change."""
        self.spacyModelCombo.setDisabled(
            self.sourceType == "IMSDB-formatted script"
        )

        # Update char list if there's an input...
        if self.inputSeg:
            self.updateCharacterList()

        self.sendButton.settingsChanged()

    def newCharacter(self):
        """Add new character to list."""
        new_value, ok = QInputDialog.getText(
            self, "New character", "Enter new line:"
        )
        if ok and self.checkInputValidity(new_value):
            self.editsWereMade = True
            self.characters.append(str(new_value))
            # Reassign so that the list box is refreshed.
            self.characters = self.characters
            self.updateButtons()
            self.sendButton.settingsChanged()

    def editCharacters(self):
        """Deal with user requested edition of character in list."""
        if not self.selectedCharacters:
            return
        selected_idx = self.selectedCharacters[0]
        old_value = self.characters[selected_idx]
        new_value, ok = QInputDialog.getText(
            self,
            "Edit character",
            "Enter new value for this line:",
            text=old_value,
        )
        if ok and self.checkInputValidity(new_value):
            if new_value != old_value:
                self.editsWereMade = True
                self.characters[selected_idx] = str(new_value)
                # Reassign so that the list box is refreshed.
                self.characters = self.characters
                self.updateButtons()
                self.sendButton.settingsChanged()

    def deleteCharacter(self):
        """Deal with user requested deletion of character in list."""
        if not self.selectedCharacters:
            return
        selected_idx = self.selectedCharacters[0]
        old_value = self.characters[selected_idx]
        answer = QMessageBox.question(
            self,
            "Delete character",
            f"Do you really want to delete line '{old_value}'",
        )
        if answer == QMessageBox.Yes:
            self.editsWereMade = True
            del self.characters[selected_idx]
            # Reassign so that the list box is refreshed.
            self.characters = self.characters
            # The selected index no longer refers to the deleted entry.
            self.selectedCharacters = list()
            self.updateButtons()
            self.sendButton.settingsChanged()

    def resetCharacters(self):
        """Revert all edits to character list."""
        self.characters = self.cachedCaracters[:]
        self.editsWereMade = False
        self.resetButton.setDisabled(not self.editsWereMade)
        self.sendButton.settingsChanged()

    def checkInputValidity(self, value):
        """Check validity of user-submitted character list entry.

        An entry is valid if it is a nonempty string consisting of nonempty
        items separated by ", ". A warning dialog is shown otherwise.

        Returns:
            bool: True if the entry is valid, False otherwise.
        """
        if value == "":
            QMessageBox.warning(
                self,
                "Invalid input",
                "Please submit a nonempty string value.",
            )
            return False
        if [item for item in value.split(", ") if item == ""]:
            QMessageBox.warning(
                self,
                "Invalid input",
                "Please make sure your entry consists in nonempty strings "
                "separated by \", \".",
            )
            return False
        return True

    def updateButtons(self):
        """Enable/disable buttons depending on selection in list."""
        self.editButton.setDisabled(len(self.selectedCharacters) == 0)
        self.deleteButton.setDisabled(len(self.selectedCharacters) == 0)
        self.resetButton.setDisabled(not self.editsWereMade)

    def sendNoneToOutputs(self):
        """Send None token to all output channels."""
        for channel in [c.name for c in self.outputs]:
            self.send(channel, None, self)
        return

    def sendData(self):
        """Compute result of widget processing and send to output."""

        # Check that there's a model...
        if not self.model:
            self.noLanguageModelWarning()
            self.sendNoneToOutputs()
            return

        # Check that there's an input (and that it has been analyzed)...
        if (
            self.inputSeg is None
            or len(self.inputSeg) == 0
            or self.char_df is None
        ):
            self.infoBox.setText("Widget needs input.", "warning")
            self.sendNoneToOutputs()
            return

        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )

        # Disable control area and initialize progress bar...
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.char_df))

        try:
            # Get start and end pos of concatenated input segments; each
            # segment is offset by the length of the previous one plus 1...
            startPositions = list()
            endPositions = list()
            offset = 0
            for segment in self.inputSeg:
                segLen = len(segment.get_content())
                startPositions.append(offset)
                endPositions.append(offset + segLen)
                offset += segLen + 1
            # The end of the last segment is extended by one position.
            endPositions[-1] += 1

            # Get or update character aliases...
            find_pairs = sys.modules['charnetto.find_pairs']
            characters = [entry.split(", ") for entry in self.characters]
            find_pairs.map_names(self.char_df, characters)

            # Initializations...
            charSegments = list()
            currentSegmentIdx = 0

            # For each character token in Charnetto's output...
            for index, charToken in self.char_df.iterrows():

                # One step per row, so that the bar reaches its end even
                # when rows are skipped.
                progressBar.advance()

                # Skip non-PER named entities.
                if charToken["tag"] != "PER":
                    continue

                # Get index of containing segment...
                while charToken["end_pos"] > endPositions[currentSegmentIdx]:
                    currentSegmentIdx += 1

                # Create segment for char with its actual coordinates...
                strIndex = self.inputSeg[currentSegmentIdx].str_index
                start = (
                    charToken["start_pos"]
                    - startPositions[currentSegmentIdx]
                )
                end = charToken["end_pos"] - startPositions[currentSegmentIdx]
                annotations = {"id": charToken["alias"]}
                charSegments.append(
                    Segment(strIndex, start, end, annotations)
                )

            # Send output...
            outputSegmentation = Segmentation(
                charSegments, label=self.captionTitle
            )
            self.send("Character segmentation", outputSegmentation, self)

            # Set status to OK and report data size...
            message = "%i segment@p sent to output." % len(outputSegmentation)
            message = pluralize(message, len(outputSegmentation))
            self.infoBox.setText(message)
        finally:
            # Clear progress bar and re-enable UI, even on failure.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()

    #----------------------------------------------------------------------
    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class MovieTranscripts(OWTextableBaseWidget):
    """Textable widget for importing movie scripts from the
    springfieldspringfield.co.uk website
    (https://www.springfieldspringfield.co.uk)
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Movie Transcripts"
    description = "Import movie transcripts from www.springfieldspringfield.co.uk"
    icon = "icons/Movie_Transcripts.png"
    priority = 11

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = []
    outputs = [("Movie transcripts", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0]
    )

    # Saved settings
    autoSend = settings.Setting(False)
    myBasket = settings.Setting([])

    # Other class variables...
    cacheFilename = "cache_movie_transcripts"

    def __init__(self):
        """Widget creator."""

        super().__init__()

        # Result of the last search (None until a search has been run).
        self.searchResults = None
        self.inputSeg = None
        # Content of the search line edit.
        self.newQuery = ''
        # Labels and selection of the "search results" list box.
        self.titleLabels = list()
        self.selectedTitles = list()
        # Labels and selection of the "corpus" list box.
        self.myTitles = list()
        self.mytitleLabels = list()
        # Input objects created for the downloaded scripts.
        self.createdInputs = list()
        # Maps a displayed title to the URL fragment of its script page.
        self.path_storage = dict()
        # Titles found by the last search.
        self.movie_titles = list()
        # Full title -> URL fragment database (loaded from the cache or
        # scraped from the website).
        self.title_to_href = dict()

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
        )

        # User interface...

        # Search box
        queryBox = gui.widgetBox(
            widget=self.controlArea,
            box="Search movie",
            orientation="vertical",
        )

        searchBox = gui.widgetBox(
            widget=queryBox,
            orientation="horizontal",
        )

        # Line edit bound to the "newQuery" attribute
        gui.lineEdit(
            widget=searchBox,
            master=self,
            value='newQuery',
            orientation='horizontal',
            labelWidth=100,
            tooltip=("Enter a movie title"),
        )

        # Search button
        self.searchButton = gui.button(
            widget=searchBox,
            master=self,
            label='Search',
            callback=self.searchFunction,
            tooltip='Search for the movie',
        )
        gui.separator(widget=queryBox, height=3)

        # Button that refreshes all movie titles from the website
        self.refreshButton = gui.button(
            widget=queryBox,
            master=self,
            label="Refresh database",
            callback=self.refreshTitles,
            tooltip="Update SpringfieldSpringfield database"
        )

        # List box that displays search results
        self.titleListbox = gui.listBox(
            widget=queryBox,
            master=self,
            value="selectedTitles",    # setting (list)
            labels="titleLabels",      # setting (list)
            callback=lambda: self.selectButton.setDisabled(
                self.selectedTitles == list()),
            tooltip="Select the movie transcript you want to import",
        )
        self.titleListbox.doubleClicked.connect(self.Add)
        self.titleListbox.setMinimumHeight(120)
        self.titleListbox.setSelectionMode(3)

        boxbutton = gui.widgetBox(
            widget=queryBox,
            box=False,
            orientation='horizontal',
        )

        # "Add to corpus" button
        self.selectButton = gui.button(
            widget=boxbutton,
            master=self,
            label="Add to corpus",
            callback=self.Add,
            tooltip="Add selected movie to the corpus",
        )
        self.selectButton.setDisabled(True)

        # "Clear results" button
        self.clearButton = gui.button(
            widget=boxbutton,
            master=self,
            label="Clear results",
            callback=self.clearResults,
            tooltip="Clear results",
        )
        self.clearButton.setDisabled(True)

        # Area where the movies selected for import are listed
        mytitleBox = gui.widgetBox(
            widget=self.controlArea,
            box="Corpus",
            orientation="vertical",
        )

        self.mytitleListbox = gui.listBox(
            widget=mytitleBox,
            master=self,
            value="myTitles",
            labels="mytitleLabels",
            callback=lambda: self.removeButton.setDisabled(
                self.myTitles == list()),
            tooltip="The list of titles whose content will be imported",
        )
        self.mytitleListbox.doubleClicked.connect(self.Remove)
        self.mytitleListbox.setMinimumHeight(150)
        self.mytitleListbox.setSelectionMode(3)

        boxbutton2 = gui.widgetBox(
            widget=mytitleBox,
            box=False,
            orientation='horizontal',
        )

        # "Remove from corpus" button
        self.removeButton = gui.button(
            widget=boxbutton2,
            master=self,
            label=u'Remove from corpus',
            callback=self.Remove,
            tooltip="Remove the selected movie from your corpus.",
        )
        self.removeButton.setDisabled(True)

        # "Clear corpus" button
        self.clearmyBasket = gui.button(
            widget=boxbutton2,
            master=self,
            label=u'Clear corpus',
            callback=self.ClearmyCorpus,
            tooltip="Remove all movies from your corpus.",
        )
        self.clearmyBasket.setDisabled(True)

        gui.rubber(self.controlArea)

        #----------------------------------------------------------------------
        # Draw Info box and Send button
        self.sendButton.draw()
        self.searchButton.setDefault(True)
        self.infoBox.draw()

        # This initialization step needs to be done after infoBox has been
        # drawn (because we may need to display an error message).
        self.loadDatabaseCache()

        # Make sure that whatever was in the corpus last time is deleted
        self.ClearmyCorpus()

        # Send data if autoSend.
        self.sendButton.sendIf()

    def searchFunction(self):
        """Search the title database for the current query.

        Fills the results list box with the matches found for
        ``self.newQuery`` and reports the outcome in the info box.
        """
        self.controlArea.setDisabled(True)

        query_string = self.newQuery
        testdict = self.title_to_href

        # Reset and clear the visible widget list (the self-assignments
        # notify the GUI that the bound list has changed).
        del self.titleLabels[:]
        self.titleLabels = self.titleLabels
        del self.movie_titles[:]
        self.movie_titles = self.movie_titles

        if query_string != "":
            # Initialize progress bar.
            progressBar = ProgressBar(self, iterations=1)

            self.searchResults = process.extractBests(
                query_string,
                testdict,
                limit=100000,
                score_cutoff=80
            )

            progressBar.finish()

            progressBar = ProgressBar(self, iterations=len(self.searchResults))

            # NOTE(review): with a dict of choices, extractBests is expected
            # to yield (matched value, score, dict key) triples, i.e. here
            # (URL fragment, score, title) -- confirm against fuzzywuzzy docs.
            for href, _score, title in self.searchResults:
                self.titleLabels.append(title)
                self.movie_titles.append(title)
                self.path_storage[title] = href
                # 1 tick on the progress bar of the widget
                progressBar.advance()

            self.titleLabels = self.titleLabels
            self.clearButton.setDisabled(False)

            # Clear progress bar.
            progressBar.finish()
        else:
            # Forget the results of any previous search, otherwise an empty
            # query would be reported below as "Search complete".
            self.searchResults = None

        if self.searchResults:
            self.infoBox.setText("Search complete")
        elif self.searchResults == []:
            self.infoBox.setText("No result please try again", 'warning')
        else:
            self.infoBox.setText(
                "Please, enter a query in a search bar",
                "warning"
            )

        self.controlArea.setDisabled(False)

    def clearResults(self):
        """Clear the results list"""
        del self.titleLabels[:]
        self.titleLabels = self.titleLabels
        del self.movie_titles[:]
        self.movie_titles = self.movie_titles
        self.clearButton.setDisabled(True)

    def loadDatabaseCache(self):
        """Load the cached database, or rebuild it if it cannot be read."""
        # Try to open saved file in this module's directory...
        path = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe()))
        )
        cachePath = os.path.join(path, self.__class__.cacheFilename)
        try:
            # NOTE: pickle must only be used on trusted data; this cache is
            # a local file written by refreshTitles().
            with open(cachePath, "rb") as cacheFile:
                self.title_to_href = pickle.load(cacheFile)
        # Else (missing, unreadable or corrupted cache) try to rebuild the
        # cache from the SpringfieldSpringfield website...
        except (IOError, EOFError, pickle.UnpicklingError):
            self.refreshTitles()

    def refreshTitles(self):
        """Refresh the database cache.

        Asks the user for confirmation, scrapes the whole title list from
        the website and, if the download succeeded, saves it to the cache
        file located in this module's directory.
        """
        dialog = PyQt5.QtWidgets.QMessageBox()
        response = dialog.question(
            self,
            "springfieldspringfield",
            "Are you sure you want to refresh the database?\n" +
            "It will take several minutes",
            dialog.Yes | dialog.No
        )

        # Do not announce a scraping operation the user has just declined.
        if response == dialog.No:
            return

        self.infoBox.setText(
            "Scraping SpringfieldSpringfield website, please wait...",
            "warning",
        )
        self.warning("Warning : it will take several minutes")

        try:
            self._scrapeAllTitles()
        except Exception:
            # Do not cache (or report as successful) a partial download.
            self.infoBox.setText(
                "Error while attempting to scrape the " +
                "SpringfieldSpringfield website.",
                "error",
            )
            return

        path = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe()))
        )
        try:
            cachePath = os.path.join(path, self.__class__.cacheFilename)
            with open(cachePath, "wb") as cacheFile:
                pickle.dump(self.title_to_href, cacheFile)
            self.infoBox.setText(
                "Database successfully updated",
            )
        except IOError:
            self.infoBox.setText(
                "Couldn't save database to disk.",
                "warning",
            )

    def get_all_titles(self):
        """Get all movie titles from www.springfieldspringfield.co.uk.

        Download errors are reported in the info box rather than raised.

        Returns:
            The ``self.title_to_href`` dict (title -> URL fragment), which
            may be only partially updated if the download failed.
        """
        try:
            self._scrapeAllTitles()
        except Exception:
            self.infoBox.setText(
                "Couldn't download data from springfieldspringfield website.",
                "error"
            )
        return self.title_to_href

    def _scrapeAllTitles(self):
        """Scrape every title page of the website into self.title_to_href.

        Any exception raised while downloading or parsing is propagated to
        the caller; the progress bar and control area are restored in all
        cases.
        """
        # php_query_string and http_query_string are the variables that
        # will need to be changed if a different database is used or if the
        # current database's structure undergoes changes.
        php_query_string = '/movie_script.php?movie='
        http_query_string = 'https://www.springfieldspringfield.co.uk/' + \
                            'movie_scripts.php?order='
        alphabet = ['0', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
                    'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U',
                    'V', 'W', 'X', 'Y', 'Z']

        # script_pattern may need to be changed if another database is used
        # or if the current database undergoes change.
        script_pattern = re.compile("^script-list-item")

        # Initialize progress bar.
        progressBar = ProgressBar(
            self,
            iterations=len(alphabet)
        )
        self.controlArea.setDisabled(True)

        try:
            for lettre in alphabet:
                page_num = 1
                # 1 tick on the progress bar of the widget
                progressBar.advance()

                # Walk through the result pages of this letter until a page
                # without any script link is reached.
                while True:
                    page_url = http_query_string + '%s&page=%i' % (
                        lettre,
                        page_num,
                    )
                    page = urllib.request.urlopen(page_url)
                    soup = BeautifulSoup(page, 'html.parser')
                    script_links = soup.findAll(
                        'a', attrs={'class': script_pattern}
                    )
                    if not script_links:
                        break
                    links = dict()
                    for link in script_links:
                        # Keep only what follows the PHP query prefix.
                        links[link.text] = \
                            link.get('href')[len(php_query_string):]
                    self.title_to_href.update(links)
                    page_num += 1
        finally:
            # Clear progress bar.
            progressBar.finish()
            self.controlArea.setDisabled(False)

    def Add(self):
        """Add the movies selected in the results list to the corpus."""
        for selectedTitle in self.selectedTitles:
            movie_title = self.titleLabels[selectedTitle]
            if movie_title not in self.myBasket:
                self.myBasket.append(movie_title)
                self.mytitleLabels.append(movie_title)
                self.mytitleLabels = self.mytitleLabels
        self.clearmyBasket.setDisabled(False)
        self.sendButton.settingsChanged()

    def Remove(self):
        """Remove the movies selected in the corpus list from the corpus."""
        self.myBasket = [
            movie for idx, movie in enumerate(self.myBasket)
            if idx not in self.myTitles
        ]
        self.updateMytitleLabels()
        self.sendButton.settingsChanged()

    def ClearmyCorpus(self):
        """Remove every movie from the corpus."""
        del self.mytitleLabels[:]
        del self.myBasket[:]
        self.mytitleLabels = self.mytitleLabels
        self.clearmyBasket.setDisabled(True)
        self.sendButton.settingsChanged()

    def updateMytitleLabels(self):
        """Synchronize the corpus list box (and its buttons) with myBasket."""
        self.mytitleLabels = list(self.myBasket)
        self.clearmyBasket.setDisabled(self.myBasket == list())
        self.removeButton.setDisabled(self.myTitles == list())

    def sendData(self):
        """Download the scripts of the corpus and send them to the output.

        Each script becomes one segment annotated with the movie title and
        its year of release.
        """
        # Skip if title list is empty:
        if self.myBasket == list():
            self.infoBox.setText(
                "Your corpus is empty, please add some movies first",
                "warning"
            )
            self.segmentation = None
            self.send("Movie transcripts", self.segmentation, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        annotations = list()
        script_list = list()
        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # This part of code is what fetches the actual script
        try:
            for movie in self.myBasket:
                # Each movie that is in the corpus is split into title and
                # year (rsplit makes sure to only split on the last
                # occurrence), which will become annotations.
                future_annotation = movie.rsplit('(', 1)
                movie_title = future_annotation[0]
                # Drop the closing parenthesis.
                movie_year = future_annotation[-1][:-1]
                annotations.append({
                    "Movie Title": movie_title,
                    "Year of release": movie_year,
                })

                # link_end and page_url are the two variables that will have
                # to be changed in case scripts need to be taken from
                # elsewhere.
                link_end = self.path_storage[movie]
                page_url = "https://www.springfieldspringfield.co.uk/" + \
                    "movie_script.php?movie=" + link_end
                page = urllib.request.urlopen(page_url)
                soup = BeautifulSoup(page, 'html.parser')

                # This is what grabs the movie script
                script = soup.find("div", {"class":"movie_script"})
                script_list.append(script.text)

                # 1 tick on the progress bar of the widget
                progressBar.advance()
        except Exception:
            self.infoBox.setText(
                "Couldn't download data from SpringfieldSpringfield website.",
                "error"
            )
            # Do not leave a dangling progress bar behind.
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store downloaded script strings in input objects...
        for script in script_list:
            newInput = Input(script, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one script, the widget's output is the created
        # Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Movie transcripts", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def clearCreatedInputs(self):
        """Delete all Input objects that have been created."""
        del self.createdInputs[:]

    def setCaption(self, title):
        """Set the widget caption and flag settings as changed if needed.

        This method is meant to be present in every Textable widget that
        sends a segmentation.
        """
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                # The attribute created in __init__ is "sendButton".
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class WidgetEditList(OWTextableBaseWidget):
    """Textable widget for modifying the lexical content of the lists
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Edit Lexical List"
    description = "Edit words contained in lists (lexical fields)"
    icon = "icons/lexical_hunter.svg"

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Word segmentation", Segmentation, "inputData")]
    outputs = [("Segmentation with annotations", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = True

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    textFieldContent = settings.Setting(u''.encode('utf-8'))
    encoding = settings.Setting(u'utf-8')
    selectedFields = []
    listTitle = ""
    listWord = ""
    titleList = settings.Setting([])
    baseLocation = settings.Setting('.')

    def __init__(self, caller):
        """Widget creator.

        Args:
            caller: the widget that opened this editor; its setTitleList()
                method is called when changes are saved.
        """

        super().__init__()

        # Reference used to communicate with the base widget.
        self.caller = caller

        # Other attributes...
        self.inputSeg = None
        self.outputSeg = None

        # Helper from TextableUtils. The corresponding interface element is
        # declared here and actually drawn below.
        self.infoBox = InfoBox(widget=self.controlArea)

        # Temporary dictionary so that the user can cancel changes
        self.tempDict = defaultDict.copy()

        # User interface...

        # CONTROL AREA #
        # Options box for the structure
        titleListBox = gui.widgetBox(
            widget=self.controlArea,
            box="Lists",
            orientation="horizontal",
        )

        # SAVE AREA
        # (Displayed after the control one, but must be created first for
        # the save changes button)
        SaveBox = gui.widgetBox(
            widget=self.controlArea,
            box=None,
            orientation="horizontal",
        )
        self.SaveChanges = gui.button(
            widget=SaveBox,
            master=self,
            label="Save changes",
            callback=self.saveChanges,
            width=130,
        )
        self.CancelChanges = gui.button(
            widget=SaveBox,
            master=self,
            label="Cancel",
            callback=self.closeWindow,
            width=130,
        )
        # END OF SAVE AREA

        # List of lexical lists that the user can select
        self.titleLabelsList = gui.listBox(
            widget=titleListBox,
            master=self,
            value="selectedFields",    # setting (list)
            labels="titleList",        # setting (list)
            tooltip=
            "The list of lexical list that you want to use for annotation",
            callback=self.updateGUI,
        )
        self.titleLabelsList.setMinimumHeight(300)
        self.titleLabelsList.setMinimumWidth(150)
        self.titleLabelsList.setSelectionMode(1)

        # A box for vertical alignment of the buttons
        controlBox = gui.widgetBox(
            widget=titleListBox,
            box=None,
            orientation="vertical",
        )

        # Actions on list
        self.EditList = gui.button(
            widget=controlBox,
            master=self,
            label="Edit",
            callback=self.setEditContent,
            width=130,
            autoDefault=False,
        )
        self.ImportList = gui.button(
            widget=controlBox,
            master=self,
            label="Import",
            callback=self.importLexic,
            width=130,
            autoDefault=False,
        )
        self.ExportList = gui.button(
            widget=controlBox,
            master=self,
            label="Export All",
            callback=self.exportAllLexics,
            width=130,
        )
        self.ExportSelectedList = gui.button(
            widget=controlBox,
            master=self,
            label="Export Selected",
            callback=self.exportOneLexic,
            width=130,
        )
        self.NewList = gui.button(
            widget=controlBox,
            master=self,
            label="New",
            callback=self.newLexicalField,
            width=130,
        )
        self.ClearList = gui.button(
            widget=controlBox,
            master=self,
            label="Clear all",
            callback=self.clearList,
            width=130,
        )
        self.RemoveSelectedList = gui.button(
            widget=controlBox,
            master=self,
            label="Remove Selected",
            callback=self.deleteSelectedList,
            width=130,
        )

        # MAIN AREA (edit list) #
        listEditBox = gui.widgetBox(
            widget=self.mainArea,
            box="Edit",
            orientation="vertical",
        )
        listEditBox.setMinimumWidth(300)

        # Edit the title of the list
        self.titleEdit = gui.lineEdit(
            widget=listEditBox,
            master=self,
            value="listTitle",
            label="List name",
            orientation="vertical",
        )

        # Editable text field. Each line is one entry of the selected
        # lexical list.
        self.ContentLabel = gui.label(
            widget=listEditBox,
            master=self,
            label="List content",
        )
        self.editor = QPlainTextEdit()
        listEditBox.layout().addWidget(self.editor)
        self.editor.setMinimumHeight(300)

        buttonEditBox = gui.widgetBox(widget=listEditBox,
                                      box=None,
                                      orientation="horizontal")

        # For saving the changes made on the edited list
        self.CommitList = gui.button(
            widget=buttonEditBox,
            master=self,
            label="Commit",
            callback=self.saveEdit,
            width=100,
        )
        self.CancelList = gui.button(widget=buttonEditBox,
                                     master=self,
                                     label="Cancel list changes",
                                     callback=self.cancelListChanges,
                                     width=100)

        gui.rubber(self.controlArea)

        self.setTitleList()
        self.updateGUI()

        # Now Info box must be drawn...
        self.infoBox.draw()

        # Set the window as modal
        self.exec()

    def setEditContent(self):
        """Sets the lexical field informations when the user wants to edit it"""
        # Getting selected list title
        self.listTitle = list(self.titleList)[self.selectedFields[0]]
        # Converting words list to string
        self.editContent = '\n'.join(self.tempDict[self.listTitle])
        # Setting editor content with words list (converted to string)
        self.editor.setPlainText(self.editContent)
        # Remembering the old title (its entry is dropped on commit if the
        # user renames the list)
        self.oldTitle = self.listTitle

        self.updateGUI()

    def setTitleList(self):
        """Displays the lexical fields titles in the edit widget view"""
        self.titleList = sorted(self.tempDict.keys())

    def clearList(self):
        """Clears the list of lexical fields (after confirmation)"""
        confBox = QMessageBox(
            QMessageBox.Question, "Textable",
            "Do you really want to delete all the lexical lists?",
            QMessageBox.Yes | QMessageBox.No)

        # Getting the answer of the user
        result = confBox.exec_()
        if result == QMessageBox.Yes:
            # Reset textfields values
            self.titleEdit.setText("")
            self.editor.setPlainText("")
            # Deleting all lexical fields
            self.tempDict.clear()
            self.setTitleList()

    def deleteSelectedList(self):
        """Deletes selected lexical field (after confirmation)"""
        confBox = QMessageBox(QMessageBox.Question, "Textable",
                              "Do you really want to delete this list?",
                              QMessageBox.Yes | QMessageBox.No)

        # Getting the answer of the user
        result = confBox.exec_()
        if result == QMessageBox.Yes:
            # Getting selected list title
            self.listToDelete = list(self.titleList)[self.selectedFields[0]]
            # Reset textfields values
            self.titleEdit.setText("")
            self.editor.setPlainText("")
            # Deleting selected list
            self.tempDict.pop(self.listToDelete, None)
            self.titleList = sorted(self.tempDict.keys())

    def newLexicalField(self):
        """Sets a new (empty) entry in the lexical fields dictionary"""
        newDict = "New lexical field"
        i = 1
        while newDict in self.tempDict:
            newDict = "New lexical field %i" % i
            i += 1

        # Every other entry of the dictionary is a list of words, so an
        # empty field is an empty list (not an empty string).
        self.tempDict[newDict] = []
        self.setTitleList()

    def saveEdit(self):
        """Saves the modifications made by the user on the list"""
        # Getting textfields values
        self.val = self.editor.toPlainText()
        self.newTitle = self.titleEdit.text()

        # Reset textfields values
        self.titleEdit.setText("")
        self.editor.setPlainText("")

        wordList = self.val.split("\n")
        self.tempDict[self.newTitle] = wordList

        # Deleting old key and value when the list has been renamed (pop
        # rather than del, so that an already removed key is not an error)
        if self.newTitle != self.oldTitle:
            self.tempDict.pop(self.oldTitle, None)

        self.titleList = sorted(self.tempDict.keys())
        self.updateGUI()

    def cancelListChanges(self):
        """Discards the modifications currently made in the editor"""
        # Reset textfields values
        self.titleEdit.setText("")
        self.editor.setPlainText("")
        self.updateGUI()

    def saveChanges(self):
        """Saves changes made by the user"""
        defaultDict.clear()
        defaultDict.update(self.tempDict)
        self.hide()
        self.caller.setTitleList()

    def closeWindow(self):
        """Cancels changes made by the user"""
        self.hide()

    def importLexic(self):
        """Lets the user import a lexical field from a text file"""
        # Opening a file browser
        filePath = QFileDialog.getOpenFileName(self,
                                               u'Import lexical field file',
                                               self.baseLocation,
                                               u'Text files (*)')
        # Depending on the Qt binding, the dialog returns either the path
        # or a (path, selected filter) tuple.
        if isinstance(filePath, tuple):
            filePath = filePath[0]
        if not filePath:
            return

        self.file = os.path.normpath(filePath)
        self.baseLocation = os.path.dirname(filePath)
        fileName = os.path.join(self.baseLocation, self.file)

        # The list is named after the file, without its .txt extension
        lexicName = re.sub(r'\.txt$', '', os.path.basename(fileName))

        # Trying to read the file; its lines become the words of the list
        try:
            with open(fileName, encoding='utf-8') as fileHandle:
                content = fileHandle.readlines()
        except IOError:
            QMessageBox.warning(None, 'Textable', "Couldn't open file.",
                                QMessageBox.Ok)
            return

        # Deleting whitespace (including line ends)
        self.tempDict[lexicName] = [re.sub(r'\s', "", i) for i in content]
        self.setTitleList()

    def exportOneLexic(self):
        """Lets the user export the selected list to a text file"""
        # Opening file browser
        filePath = QFileDialog.getSaveFileName(
            self,
            u'Export Selected Lexical List',
            self.baseLocation,
        )
        # Depending on the Qt binding, the dialog returns either the path
        # or a (path, selected filter) tuple.
        if isinstance(filePath, tuple):
            filePath = filePath[0]

        # Saving lexic content
        if filePath:
            # Setting content to save
            exportTitle = list(self.titleList)[self.selectedFields[0]]
            exportContent = self.tempDict[exportTitle]

            with open(
                    filePath,
                    encoding='utf8',
                    mode='w+',
                    errors='xmlcharrefreplace',
            ) as outputFile:
                outputFile.write('\n'.join(exportContent))

            QMessageBox.information(None, 'Textable',
                                    'Lexical file correctly exported',
                                    QMessageBox.Ok)

    def exportAllLexics(self):
        """Lets the user export all the lexics (one text file per list)"""
        # Opening file browser
        filePath = QFileDialog.getExistingDirectory(self,
                                                    u'Export Selected List',
                                                    self.baseLocation)

        if filePath:
            for name in self.tempDict:
                exportName = name.replace(" ", "_")
                fullName = os.path.join(filePath, exportName + ".txt")

                with open(
                        fullName,
                        encoding='utf8',
                        mode='w+',
                        errors='xmlcharrefreplace',
                ) as outputFile:
                    outputFile.write('\n'.join(self.tempDict[name]))

            QMessageBox.information(None, 'Textable',
                                    'Lexical files correctly exported',
                                    QMessageBox.Ok)

    def inputData(self, newInput):
        """Process incoming data (nothing to do in this widget)."""
        pass

    def updateGUI(self):
        """Enables/disables the interface elements.

        While a list is being edited (non-empty title field), only the
        editing controls are enabled; otherwise only the list management
        controls are. Actions requiring a selected list are disabled when
        nothing is selected.
        """
        editing = self.titleEdit.text() != ""

        listControls = (
            self.SaveChanges,
            self.CancelChanges,
            self.EditList,
            self.ImportList,
            self.ExportList,
            self.ExportSelectedList,
            self.NewList,
            self.ClearList,
            self.RemoveSelectedList,
        )
        editControls = (
            self.CommitList,
            self.CancelList,
            self.editor,
            self.titleEdit,
        )
        for control in listControls:
            control.setDisabled(editing)
        for control in editControls:
            control.setDisabled(not editing)

        if not self.selectedFields:
            # Disabled elements if a list isn't selected
            self.RemoveSelectedList.setDisabled(True)
            self.ExportSelectedList.setDisabled(True)
            self.EditList.setDisabled(True)

    def setCaption(self, title):
        """Set the widget caption and flag settings as changed if needed."""
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            # This widget's __init__ creates no send button, so only notify
            # one if it exists.
            sendButton = getattr(self, "sendButton", None)
            if changed and sendButton is not None:
                sendButton.settingsChanged()
        else:
            super().setCaption(title)
class ECP(OWTextableBaseWidget): """Textable widget for importing XML-TEI data from the Eighteenth Century Poetry website (http://www.eighteenthcenturypoetry.org/) """ #---------------------------------------------------------------------- # Widget"s metadata... name = "18th Century Poetry" description = "Import XML-TEI data from ECP website" icon = "icons/18th_century_poetry.svg" priority = 10 #---------------------------------------------------------------------- # Channel definitions (NB: no input in this case)... inputs = [] outputs = [("XML-TEI data", Segmentation)] #---------------------------------------------------------------------- # Settings... settingsHandler = VersionedSettingsHandler( version=__version__.rsplit(".", 1)[0]) autoSend = settings.Setting(False) selectedTitles = settings.Setting([]) titleLabels = settings.Setting([]) filterCriterion = settings.Setting("author") filterValue = settings.Setting("(all)") importedURLs = settings.Setting([]) displayAdvancedSettings = settings.Setting(False) want_main_area = False def __init__(self): """Widget creator.""" super().__init__() # Other attributes... self.segmentation = None self.createdInputs = list() self.titleSeg = None self.filteredTitleSeg = None self.filterValues = dict() self.base_url = \ u"http://www.eighteenthcenturypoetry.org/works/#genres" self.document_base_url = \ u"http://www.eighteenthcenturypoetry.org" # Next two instructions are helpers from TextableUtils. Corresponding # interface elements are declared here and actually drawn below (at # their position in the UI)... self.infoBox = InfoBox(widget=self.controlArea) self.sendButton = SendButton( widget=self.controlArea, master=self, callback=self.sendData, infoBoxAttribute="infoBox", sendIfPreCallback=self.updateGUI, ) # The AdvancedSettings class, also from TextableUtils, facilitates # the management of basic vs. advanced interface. 
An object from this # class (here assigned to self.advancedSettings) contains two lists # (basicWidgets and advancedWidgets), to which the corresponding # widgetBoxes must be added. self.advancedSettings = AdvancedSettings( widget=self.controlArea, master=self, callback=self.updateFilterValueList, ) # User interface... # Advanced settings checkbox (basic/advanced interface will appear # immediately after it... self.advancedSettings.draw() # Filter box (advanced settings only) filterBox = gui.widgetBox( widget=self.controlArea, box="Filter", orientation="vertical", ) filterCriterionCombo = gui.comboBox( widget=filterBox, master=self, value="filterCriterion", items=["author", "genre"], sendSelectedValue=True, orientation="horizontal", label="Criterion:", labelWidth=120, callback=self.updateFilterValueList, tooltip=( "Please select a criterion for searching the title list\n"), ) filterCriterionCombo.setMinimumWidth(120) gui.separator(widget=filterBox, height=3) self.filterValueCombo = gui.comboBox( widget=filterBox, master=self, value="filterValue", sendSelectedValue=True, orientation="horizontal", label="Value:", labelWidth=120, callback=self.updateTitleList, tooltip=("Please select a value for the chosen criterion."), ) gui.separator(widget=filterBox, height=3) # The following lines add filterBox (and a vertical separator) to the # advanced interface... 
self.advancedSettings.advancedWidgets.append(filterBox) self.advancedSettings.advancedWidgetsAppendSeparator() # Title box titleBox = gui.widgetBox( widget=self.controlArea, box="Titles", orientation="vertical", ) self.titleListbox = gui.listBox( widget=titleBox, master=self, value="selectedTitles", # setting (list) labels="titleLabels", # setting (list) callback=self.sendButton.settingsChanged, tooltip="The list of titles whose content will be imported", ) self.titleListbox.setMinimumHeight(150) self.titleListbox.setSelectionMode(3) gui.separator(widget=titleBox, height=3) gui.button( widget=titleBox, master=self, label="Refresh", callback=self.refreshTitleSeg, tooltip="Connect to ECP website and refresh list.", ) gui.separator(widget=titleBox, height=3) gui.separator(widget=self.controlArea, height=3) gui.rubber(self.controlArea) # Now Info box and Send button must be drawn... self.sendButton.draw() self.infoBox.draw() # This initialization step needs to be done after infoBox has been # drawn (because getTitleSeg may need to display an error message). self.getTitleSeg() # Send data if autoSend. self.sendButton.sendIf() self.setMinimumWidth(350) self.adjustSizeWithTimer() def sendData(self): """Compute result of widget processing and send to output""" # Skip if title list is empty: if self.titleLabels == list(): return # Check that something has been selected... if len(self.selectedTitles) == 0: self.infoBox.setText("Please select one or more titles.", "warning") self.send("XML-TEI data", None, self) return # Clear created Inputs. self.clearCreatedInputs() # Initialize progress bar. progressBar = gui.ProgressBar(self, iterations=len(self.selectedTitles)) # Attempt to connect to ECP and retrieve plays... 
xml_contents = list() annotations = list() try: for title in self.selectedTitles: doc_url = self.document_base_url + \ self.filteredTitleSeg[title].annotations["url"] print(doc_url) url = re.sub(r"/([^/]+)\.shtml", r"/\1/\1.xml", doc_url) print(url) response = urllib.request.urlopen(url) xml_contents.append(response.read().decode('utf-8')) source_annotations = \ self.filteredTitleSeg[title].annotations.copy() #source_annotations["url"] = source_annotations["href"] #del source_annotations["href"] annotations.append(source_annotations) progressBar.advance() # 1 tick on the progress bar... # If an error occurs (e.g. http error, or memory error)... except: #Set Info box and widget to "error" state. self.infoBox.setText("Couldn't download data from ECP website.", "error") # Reset output channel. self.send("XML-TEI data", None, self) return # Store downloaded XML in input objects... for xml_content_idx in range(len(xml_contents)): newInput = Input(xml_contents[xml_content_idx], self.captionTitle) self.createdInputs.append(newInput) # If there"s only one play, the widget"s output is the created Input. if len(self.createdInputs) == 1: self.segmentation = self.createdInputs[0] # Otherwise the widget"s output is a concatenation... else: self.segmentation = Segmenter.concatenate( self.createdInputs, self.captionTitle, import_labels_as=None, ) # Annotate segments... for idx, segment in enumerate(self.segmentation): segment.annotations.update(annotations[idx]) self.segmentation[idx] = segment # Store imported URLs as setting. self.importedURLs = [ self.filteredTitleSeg[self.selectedTitles[0]].annotations["url"] ] # Set status to OK and report data size... message = "%i segment@p sent to output " % len(self.segmentation) message = pluralize(message, len(self.segmentation)) numChars = 0 for segment in self.segmentation: segmentLength = len(Segmentation.get_data(segment.str_index)) numChars += segmentLength message += "(%i character@p)." 
% numChars message = pluralize(message, numChars) self.infoBox.setText(message) progressBar.finish() # Clear progress bar. progressBar.finish() # Send token... self.send("XML-TEI data", self.segmentation, self) self.sendButton.resetSettingsChangedFlag() def getTitleSeg(self): """Get title segmentation, either saved locally or online""" # Try to open saved file in this module"s directory... path = os.path.dirname( os.path.abspath(inspect.getfile(inspect.currentframe()))) try: file = open(os.path.join(path, "cached_title_list_ecp"), "rb") self.titleSeg = pickle.load(file) file.close() # Else try to load list from ECP and build new seg... except IOError: self.titleSeg = self.getTitleListFromECP() # Build author and genre lists... if self.titleSeg is not None: self.filterValues["author"] = Processor.count_in_context( units={ "segmentation": self.titleSeg, "annotation_key": "author" }).col_ids self.filterValues["author"].sort() self.filterValues["genre"] = Processor.count_in_context( units={ "segmentation": self.titleSeg, "annotation_key": "genre" }).col_ids self.filterValues["genre"].sort() # Sort the segmentation alphabetically based on titles (nasty hack!)... self.titleSeg.buffer.sort(key=lambda s: s.annotations["title"]) # Update title and filter value lists (only at init and on manual # refresh, therefore separate from self.updateGUI). self.updateFilterValueList() def refreshTitleSeg(self): """Refresh title segmentation from website""" self.titleSeg = self.getTitleListFromECP() # Update title and filter value lists (only at init and on manual # refresh, therefore separate from self.updateGUI). self.updateFilterValueList() def getTitleListFromECP(self): """Fetch titles from the ECP website""" self.infoBox.customMessage( "Fetching data from ECP website, please wait") # Attempt to connect to ECP... 
try: response = urllib.request.urlopen(self.base_url) base_html = response.read().decode('utf-8') self.infoBox.customMessage("Done fetching data from ECP website.") # If unable to connect (somehow)... except: # Set Info box and widget to "warning" state. self.infoBox.noDataSent(warning="Couldn't access ECP website.") # Empty title list box. self.titleLabels = list() # Reset output channel. self.send("XML-TEI data", None, self) return None # Otherwise store HTML content in LTTL Input object. base_html_seg = Input(base_html) # Remove accents from the data... recoded_seg, _ = Segmenter.recode(base_html_seg, remove_accents=True) # Extract table containing titles... genresListSeg = Segmenter.import_xml( segmentation=recoded_seg, element="ul", conditions={"id": re.compile(r"^genres-list")}, ) # Extract genre annotation... genreSeg = Segmenter.tokenize( segmentation=genresListSeg, regexes=[(re.compile(r'<a id[^>]+>(.+?)</a.+?(?=<a id|$)(?s)'), \ "tokenize", {"genre": "&1"})], import_annotations=False, ) # Extract works... titleSeg = Segmenter.tokenize( segmentation=genreSeg, regexes=[(re.compile(r'<li class="bibl".+?</span>(?s)'), \ "tokenize")], ) # Extract annotations... titleSeg = Segmenter.tokenize( segmentation=titleSeg, regexes=[ (re.compile(r"^.*>\n(.+?)</span>.*$(?s)"), "tokenize", { "author": "&1" }), (re.compile(r'^.*href="(/works/.+?\.shtml)">.*$(?s)'), "tokenize", { "url": "&1" }), (re.compile(r'^.*shtml">(.*)</a>.*$(?s)'), "tokenize", { "title": "&1" }), ], merge_duplicates=True, ) # Try to save list in this module"s directory for future reference... path = os.path.dirname( os.path.abspath(inspect.getfile(inspect.currentframe()))) try: file = open(os.path.join(path, "cached_title_list_ecp"), "wb") pickle.dump(titleSeg, file, -1) file.close() except IOError: pass # Remove warning (if any)... 
self.error(0) self.warning(0) return titleSeg def updateFilterValueList(self): """Update the list of filter values""" # In Advanced settings mode, populate filter value list... if self.titleSeg is not None and self.displayAdvancedSettings: self.filterValueCombo.clear() self.filterValueCombo.addItem("(all)") for filterValue in self.filterValues[self.filterCriterion]: self.filterValueCombo.addItem(filterValue) # Reset filterValue if needed... if self.filterValue not in [ self.filterValueCombo.itemText(i) for i in range(self.filterValueCombo.count()) ]: self.filterValue = "(all)" else: self.filterValue = self.filterValue self.updateTitleList() def updateTitleList(self): """Update the list of titles""" # If titleSeg has not been loaded for some reason, skip. if self.titleSeg is None: return # In Advanced settings mode, get list of selected titles... if self.displayAdvancedSettings and self.filterValue != "(all)": self.filteredTitleSeg, _ = Segmenter.select( segmentation=self.titleSeg, regex=re.compile(r"^%s$" % self.filterValue), annotation_key=self.filterCriterion, ) else: self.filteredTitleSeg = self.titleSeg # If criterion is not "genre" and his filter value not "all", # group titles with different genres... # Create a dictionary with "author" and "title" as key... unique_titles = dict() for title in self.filteredTitleSeg: title_id = ( title.annotations["author"], title.annotations["title"], ) try: unique_titles[title_id].append(title) except KeyError: unique_titles[title_id] = [title] # Create a list with new annotation comporting all genres... 
new_title_segments = list() for unique_title in unique_titles.values(): title_genres = list() new_title_segments.append(unique_title[0]) title_genres.append(unique_title[0].annotations["genre"]) for equivalent_title in unique_title[1:]: title_genres.append(equivalent_title.annotations["genre"]) new_title_segments[-1].annotations["genre"] = ", ".join( sorted(list(set(title_genres)))) self.filteredTitleSeg = Segmentation(None) self.filteredTitleSeg.extend(new_title_segments) # Populate titleLabels list with the titles... self.titleLabels = sorted( [s.annotations["title"] for s in self.filteredTitleSeg]) # Add specification (author, year and genre, depending on criterion)... titleLabels = self.titleLabels[:] for idx, titleLabel in enumerate(titleLabels): specs = list() if (self.displayAdvancedSettings == False or self.filterCriterion != "author" or self.filterValue == "(all)"): specs.append(self.filteredTitleSeg[idx].annotations["author"]) if (self.displayAdvancedSettings == False or self.filterCriterion != "genre" or self.filterValue == "(all)"): specs.append(self.filteredTitleSeg[idx].annotations["genre"]) titleLabels[idx] = titleLabel + " (%s)" % "; ".join(specs) self.titleLabels = titleLabels # Reset selectedTitles if needed... 
if not set(self.importedURLs).issubset( set(u.annotations["url"] for u in self.filteredTitleSeg)): self.selectedTitles = list() else: self.selectedTitles = self.selectedTitles self.sendButton.settingsChanged() def updateGUI(self): """Update GUI state""" if self.displayAdvancedSettings: self.advancedSettings.setVisible(True) else: self.advancedSettings.setVisible(False) if len(self.titleLabels) > 0: self.selectedTitles = self.selectedTitles def clearCreatedInputs(self): """Delete all Input objects that have been created.""" for i in self.createdInputs: Segmentation.set_data(i[0].str_index, None) del self.createdInputs[:] def onDeleteWidget(self): """Free memory when widget is deleted (overriden method)""" self.clearCreatedInputs() # The following method need to be copied (without any change) in # every Textable widget... def setCaption(self, title): if 'captionTitle' in dir(self): changed = title != self.captionTitle super().setCaption(title) if changed: self.sendButton.settingsChanged() else: super().setCaption(title)
class LexicalHunter(OWTextableBaseWidget):
    """Textable widget for identifying lexical fields in segments.

    The widget receives a word segmentation, lets the user pick one or more
    lexical lists (stored in the module-level ``defaultDict``) and emits a
    copy of the input in which every segment matching a selected list is
    annotated with the name of that list.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Lexical Hunter"
    description = "Identify words contained in lists (lexical fields)"
    icon = "icons/lexical_hunter.svg"
    priority = 22

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Word segmentation", Segmentation, "inputData")]
    outputs = [("Segmentation with annotations", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    savedDict = settings.Setting({})
    selectedFields = settings.Setting([])
    autoSend = settings.Setting(False)
    labelName = settings.Setting("Topic")

    def __init__(self):
        """Widget creator."""

        super().__init__()

        # Other attributes...
        self.inputSeg = None
        self.outputSeg = None
        self.titleLabels = []

        # Restore the saved dictionaries (if any) into the module-level
        # defaultDict, which is the working copy used by the whole widget.
        if self.savedDict:
            defaultDict.clear()
            defaultDict.update(self.savedDict)

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=self.updateGUI,
        )

        # User interface...

        # Options box...
        titleLabelsList = gui.widgetBox(
            widget=self.controlArea,
            box="Click to select the lexical lists",
            orientation="vertical",
        )

        # List of lexical lists that the user can select.
        self.titleListbox = gui.listBox(
            widget=titleLabelsList,
            master=self,
            value="selectedFields",  # setting (list)
            labels="titleLabels",  # setting (list)
            callback=self.sendButton.settingsChanged,
            # Implicit concatenation rather than a backslash continuation
            # inside the literal, which would embed the source indentation
            # in the tooltip text.
            tooltip="The list of lexical list that you want "
                    "to use for annotation",
        )
        self.titleListbox.setMinimumHeight(150)
        self.titleListbox.setSelectionMode(2)

        # Edit a list...
        self.OptionList = gui.button(
            widget=titleLabelsList,
            master=self,
            label="Edit lists",
            callback=self.editList,
            width=100,
        )

        self.titleEdit = gui.lineEdit(
            widget=self.controlArea,
            master=self,
            value="labelName",
            label="Annotation key : ",
            orientation="horizontal",
        )
        self.titleEdit.setPlaceholderText("Topic")

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()

        # Show the lists in the box.
        self.setTitleList()

        # Send data if autoSend.
        self.sendButton.sendIf()

    def getDefaultLists(self):
        """DEPRECATED. Load the default lexical lists stored in txt files.

        Every ``*.txt`` file found in the ``lexicalfields`` directory next to
        this module is read as UTF-8 and stored in ``defaultDict`` under the
        file name without its extension, as a list of lines. Loading stops
        (after a message box) at the first file that cannot be opened.
        """

        # Setting the path of the files (os.path.join picks the separator
        # appropriate for the current platform)...
        location = os.path.join(
            os.path.realpath(
                os.path.join(os.getcwd(), os.path.dirname(__file__))),
            "lexicalfields",
        )

        # Initiations
        self.myContent = {}

        # For each txt file in the directory...
        for entry in os.listdir(location):
            if not entry.endswith(".txt"):
                continue

            # The list name is the file name minus its .txt extension.
            fileName = os.path.join(location, entry)
            lexicName = re.sub(r"\.txt$", "", entry)

            # Try to read the file and store its lines in the dictionary;
            # the context manager closes the handle even if read() fails.
            try:
                with codecs.open(fileName, encoding='utf-8') as fileHandle:
                    fileContent = fileHandle.read()
            except IOError:
                QMessageBox.warning(None, 'Textable', "Couldn't open file.",
                                    QMessageBox.Ok)
                return
            defaultDict[lexicName] = fileContent.split('\n')

    def setTitleList(self):
        """Display the (sorted) names of the lexical lists in the list box.

        Be careful: the order really matters, because ``selectedFields``
        stores positions in this list.
        """

        self.titleLabels = sorted(defaultDict.keys())

        # Save the dictionary used to display the list as a setting.
        self.savedDict.clear()
        self.savedDict.update(defaultDict)

    def editList(self):
        """Create and show the list edition widget."""
        self.widgetEdit = WidgetEditList(self)
        self.widgetEdit.show()
        self.setTitleList()

    def inputData(self, newInput):
        """Process incoming data."""
        # Treat the incoming segmentation as the widget's input.
        self.inputSeg = newInput
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def sendData(self):
        """Compute result of widget processing and send to output."""

        # An input is needed.
        if self.inputSeg is None:
            self.infoBox.setText("A segmentation input is needed.", "warning")
            self.send("Segmentation with annotations", None, self)
            return

        # At least one lexical list must exist. titleLabels is always a
        # list, so emptiness (not None-ness) is the meaningful test.
        if not self.titleLabels:
            self.infoBox.setText(
                "You need to define at least one lexical list.", "error")
            self.send("Segmentation with annotations", None, self)
            return

        # A list must have been selected.
        if len(self.selectedFields) == 0:
            self.infoBox.setText("Please select one or more lexical lists.",
                                 "warning")
            self.send("Segmentation with annotations", None, self)
            return

        # No check on the annotation key: huntTheLexic falls back to
        # "Topic" when labelName is empty.
        self.huntTheLexic()

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.outputSeg)
        message = pluralize(message, len(self.outputSeg))

        # Segmentation goes to output...
        self.send("Segmentation with annotations", self.outputSeg, self)
        self.infoBox.setText(message)

        self.sendButton.resetSettingsChangedFlag()

    # Main processing function of the widget.
    def huntTheLexic(self):
        """Annotate input segments with the lexical lists they belong to.

        Builds ``self.outputSeg``: a copy of the input segmentation in which
        segments matching one of the selected lists carry the list name as
        the value of the annotation key ``labelName`` ("Topic" if empty).
        """

        # Initiations...
        out = list()
        selectedListsNames = set()

        # First select the topics chosen by the user. Indices that do not
        # point into the current list (stale saved selection) are skipped.
        if self.titleLabels:
            labels = list(self.titleLabels)
            selectedListsNames = {
                labels[idx]
                for idx in self.selectedFields
                if 0 <= idx < len(labels)
            }

        # Then associate the topics with their respective lists.
        selectedLists = {
            key: value
            for key, value in defaultDict.items()
            if key in selectedListsNames
        }

        # If we have an input, select the segments of the input and label
        # them according to the lists they are found in.
        if self.inputSeg is not None:
            for filter_list, entries in selectedLists.items():
                # Empty entries would make the regex match everything.
                work_list = [entry for entry in entries if entry]
                if work_list:
                    out.append(
                        Segmenter.select(
                            self.inputSeg,
                            self.listToRegex(work_list),
                            label=filter_list,
                        )[0])

        # Lastly define the output as a copy of the input, with the segments
        # that were found labelled accordingly.
        labelNameVar = self.labelName if self.labelName != "" else "Topic"

        self.outputSeg = Segmenter.concatenate(
            [Segmenter.bypass(self.inputSeg, label="__None__")] + out,
            merge_duplicates=True,
            label=self.captionTitle,
            import_labels_as=labelNameVar,
        )

    def updateGUI(self):
        """Update GUI state."""
        if len(self.titleLabels) > 0:
            # Re-assigning the attribute re-applies the stored selection.
            self.selectedFields = self.selectedFields

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)

    def listToRegex(self, list):
        """Return a compiled regex matching any element of ``list`` exactly.

        The match is case-insensitive and anchored at both ends. The
        parameter name shadows the builtin but is kept so that existing
        keyword callers keep working.

        NOTE(review): entries are inserted verbatim, so an entry containing
        regex metacharacters is interpreted as a pattern -- confirm whether
        that is intended before escaping them.
        """

        regexString = "^(" + "|".join(list) + ")$"
        exitRegex = re.compile(regexString, re.IGNORECASE)
        return exitRegex
class Linguistica(OWTextableBaseWidget):
    """Textable widget for unsupervised morphology learning, using the
    "Crab Nebula" algorithm from John Goldsmith's Linguistica.

    The widget receives a word segmentation, learns signatures (sets of
    suffixes shared by a set of stems), parses every word into stem + suffix
    and emits a copy of the input annotated with stem, suffix and signature
    number. The main area lets the user browse words and signatures.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Linguistica"
    description = "Unupervised morphological analysis"
    icon = "icons/linguistica.svg"
    priority = 21

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Word segmentation", Segmentation, "inputData")]
    outputs = [("Morphologically analyzed data", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = True

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    autoSend = settings.Setting(False)
    minStemLen = settings.Setting(3)
    maxSuffixLen = settings.Setting(4)

    def __init__(self):
        """Widget creator."""

        super().__init__()

        # Other attributes...
        self.inputSeg = None
        self.morphology = dict()
        self.selectedMainWord = None
        self.mainWords = list()
        self.selectedParse = None
        self.parses = list()
        self.selectedStemForParse = None
        self.stemsForParse = list()
        self.selectedSuffixForParse = None
        self.suffixesForParse = list()
        self.selectedMainSignature = None
        self.mainSignatures = list()
        self.wordsForSig = list()
        self.stemsForSig = list()
        self.suffixesForSig = list()

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )

        # User interface...

        # A) Control area...

        # Options box...
        optionsBox = gui.widgetBox(
            widget=self.controlArea,
            box="Options",
            orientation="vertical",
        )
        gui.spin(
            widget=optionsBox,
            master=self,
            value='minStemLen',
            label='Minimum length of stems: ',
            callback=self.sendButton.sendIf,
            labelWidth=180,
            tooltip=(
                'Select the minimum number of required characters in stems'),
            minv=LOWER_MIN_STEM_LEN,
            maxv=MAX_MORPH_LEN,
            step=1,
        )
        gui.separator(widget=optionsBox, height=2)

        gui.rubber(self.controlArea)

        # B) Main area...

        font = QFont()
        font.setFamily('Courier')
        font.setStyleHint(QFont.Courier)
        font.setPixelSize(12)

        # Tabs...
        self.tabs = QTabWidget()
        self.wordTab = QWidget()
        self.signatureTab = QWidget()
        self.tabs.addTab(self.wordTab, "Words")
        self.tabs.addTab(self.signatureTab, "Signatures")

        # Words tab...
        wordTabBox = QHBoxLayout()
        wordBox = gui.widgetBox(
            widget=self.wordTab,
            orientation="horizontal",
            margin=5,
        )

        wordBoxRight = gui.widgetBox(widget=wordBox)

        self.mainWordListbox = gui.listBox(
            widget=wordBoxRight,
            master=self,
            value="selectedMainWord",
            labels="mainWords",
            callback=self.mainWordSelected,
            tooltip="Select a word to display its possible parses.",
        )
        self.mainWordListbox.setFont(font)

        gui.separator(widget=wordBox, width=3)

        wordBoxLeft = gui.widgetBox(widget=wordBox)

        gui.label(
            widget=wordBoxLeft,
            master=self,
            label="Parse(s):",
        )

        self.parsesListbox = gui.listBox(
            widget=wordBoxLeft,
            master=self,
            value="selectedParse",
            labels="parses",
            callback=self.parseSelected,
            tooltip="Select a parse to display the corresponding signature.",
        )
        self.parsesListbox.setFont(font)

        self.sigForParseBox = gui.widgetBox(
            widget=wordBoxLeft,
            box="Signature",
        )

        gui.label(
            widget=self.sigForParseBox,
            master=self,
            label="Stem(s):",
        )

        self.stemsForParseListbox = gui.listBox(
            widget=self.sigForParseBox,
            master=self,
            labels="stemsForParse",
            tooltip="Stems associated with the parse selected above.",
        )

        gui.separator(widget=self.sigForParseBox, height=2)

        gui.label(
            widget=self.sigForParseBox,
            master=self,
            label="Suffixes(s):",
        )

        self.suffixesForParseListbox = gui.listBox(
            widget=self.sigForParseBox,
            master=self,
            labels="suffixesForParse",
            tooltip="Suffixes associated with the parse selected above.",
        )

        wordTabBox.addWidget(wordBox)
        self.wordTab.setLayout(wordTabBox)

        # Signature tab...
        signatureTabBox = QHBoxLayout()

        signatureBox = gui.widgetBox(
            widget=self.signatureTab,
            orientation="horizontal",
            margin=5,
        )

        signatureBoxRight = gui.widgetBox(widget=signatureBox)

        self.mainSignatureListbox = gui.listBox(
            widget=signatureBoxRight,
            master=self,
            value="selectedMainSignature",
            labels="mainSignatures",
            callback=self.mainSignatureSelected,
            tooltip="Select a signature to display its contents.",
        )
        self.mainSignatureListbox.setFont(font)

        gui.separator(widget=signatureBox, width=3)

        signatureBoxLeft = gui.widgetBox(widget=signatureBox)

        gui.label(
            widget=signatureBoxLeft,
            master=self,
            label="Words:",
        )

        self.wordsForSigListbox = gui.listBox(
            widget=signatureBoxLeft,
            master=self,
            labels="wordsForSig",
            tooltip="Words associated with the selected signature.",
        )
        self.wordsForSigListbox.setFont(font)

        gui.label(
            widget=signatureBoxLeft,
            master=self,
            label="Stem(s):",
        )

        self.stemsForSigListbox = gui.listBox(
            widget=signatureBoxLeft,
            master=self,
            labels="stemsForSig",
            tooltip="Stems associated with the selected signature.",
        )
        self.stemsForSigListbox.setFont(font)

        gui.label(
            widget=signatureBoxLeft,
            master=self,
            label="Suffixes(s):",
        )

        self.suffixesForSigListbox = gui.listBox(
            widget=signatureBoxLeft,
            master=self,
            labels="suffixesForSig",
            tooltip="Suffixes associated with the selected signature.",
        )
        self.suffixesForSigListbox.setFont(font)

        signatureTabBox.addWidget(signatureBox)
        self.signatureTab.setLayout(signatureTabBox)

        self.mainArea.layout().addWidget(self.tabs)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input", "warning")

        self.setMinimumWidth(602)
        self.setMinimumHeight(317)
        self.adjustSizeWithTimer()

        # Send data if autoSend.
        self.sendButton.sendIf()

    def inputData(self, newInput):
        """Process incoming data."""
        self.inputSeg = newInput
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def _wordsByFrequency(self):
        """Return the analysed words sorted by decreasing frequency.

        This is the order in which words are shown in the main word list, so
        list box indices can be mapped back to words through it.
        """
        wordCounts = self.morphology["wordCounts"]
        return sorted(wordCounts, key=wordCounts.get, reverse=True)

    def mainSignatureSelected(self):
        """Display selected signature and generated words."""

        # Clear all dependent lists and return if no selected signature...
        if not self.selectedMainSignature:
            self.wordsForSig = list()
            self.stemsForSig = list()
            self.suffixesForSig = list()
            return

        sigIndex = self.selectedMainSignature[0]
        sigs = self.morphology["signatures"]
        wordCounts = self.morphology["wordCounts"]

        # Get generated words (by decreasing frequency)...
        if sigIndex == 0:
            # Signature 0 gathers the words that could not be analysed.
            words = sorted([
                w for w in wordCounts.keys()
                if self.morphology["parser"][w][0].signature == 0
            ])
        else:
            su = list(sigs.keys())[sigIndex - 1]
            words = ["".join(pair) for pair in itertools.product(sigs[su], su)]
        # Indexing the Counter (rather than .get) yields 0 instead of None
        # for a generated word that is not attested, which keeps the sort
        # keys comparable.
        words.sort(key=lambda word: wordCounts[word], reverse=True)

        # Display generated words (nothing to pad when the list is empty)...
        max_count = wordCounts[words[0]] if words else 0
        padding = len(str(max_count)) + 1
        self.wordsForSig = [
            '{num: {width}} {word}'.format(
                num=wordCounts[word],
                width=padding,
                word=word,
            ) for word in words
        ]

        # Display stems and suffixes in signature...
        if sigIndex > 0:
            suffixes = list(sigs.keys())[sigIndex - 1]
            self.suffixesForSig = [suffix or "NULL" for suffix in suffixes]
            self.stemsForSig = sigs[suffixes]
        else:
            self.suffixesForSig = ["NULL"]
            self.stemsForSig = sorted(words[:])

    def mainWordSelected(self):
        """Display possible parses for selected word."""

        self.sigForParseBox.setTitle(" Signature ")

        # Clear all dependent lists and return if no selected word...
        if not self.selectedMainWord:
            self.parses = list()
            self.stemsForParse = list()
            self.suffixesForParse = list()
            return

        # Get selected word's parses...
        words = self._wordsByFrequency()
        parses = self.morphology["parser"][words[self.selectedMainWord[0]]]

        # Display parses...
        self.parses = [
            '{score:.2f} {stem} + {suffix}'.format(
                score=parse.score,
                stem=parse.stem,
                suffix=parse.suffix if parse.suffix else "NULL",
            ) for parse in parses
        ]
        # Select the first parse and show its signature.
        self.selectedParse = [0]
        self.parseSelected()

    def parseSelected(self):
        """Display selected parse's signature."""

        # Return if no selected parse (or no word to take the parse from)...
        if not self.selectedParse or not self.selectedMainWord:
            self.stemsForParse = list()
            self.suffixesForParse = list()
            return

        # Get selected parse's signature...
        words = self._wordsByFrequency()
        parses = self.morphology["parser"][words[self.selectedMainWord[0]]]
        parse = parses[self.selectedParse[0]]
        sigNum = parse.signature

        # Display stems and suffixes in parse's signature...
        if sigNum > 0:
            self.sigForParseBox.setTitle(" Signature {} ".format(sigNum))
            signatures = list(self.morphology["signatures"].keys())
            self.suffixesForParse = [
                suffix or "NULL" for suffix in signatures[sigNum - 1]
            ]
            self.stemsForParse = \
                self.morphology["signatures"][signatures[sigNum - 1]]
        else:
            self.sigForParseBox.setTitle(" Signature 0 ")
            self.suffixesForParse = ["NULL"]
            self.stemsForParse = sorted([
                w for w in words
                if self.morphology["parser"][w][0].signature == 0
            ])

    def sendData(self):
        """Compute result of widget processing and send to output."""

        # Clear morphology...
        self.morphology = dict()

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            self.send("Morphologically analyzed data", None, self)
            self.updateGUI()
            return

        # Perform morphological analysis...

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait (word count)...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=100)

        # Whatever happens during the analysis, the progress bar must be
        # cleared and the control area re-enabled, otherwise the widget
        # would stay locked after an unexpected error.
        try:
            succeeded = self._analyzeAndSend(progressBar)
        finally:
            progressBar.finish()
            self.controlArea.setDisabled(False)

        if succeeded:
            self.sendButton.resetSettingsChangedFlag()

    def _analyzeAndSend(self, progressBar):
        """Run the analysis on the current input and send the result.

        Advances ``progressBar`` along the way. Returns True when annotated
        data was sent, False when signature learning failed (in which case
        None is sent and the morphology is cleared).
        """

        # Word count...
        wordCounts = collections.Counter(
            [segment.get_content() for segment in self.inputSeg])
        self.morphology["wordCounts"] = wordCounts
        self.infoBox.setText(
            u"Processing, please wait (signature extraction)...",
            "warning",
        )
        progressBar.advance(5)  # 5 ticks on the progress bar...

        # Learn signatures...
        try:
            lxa5crab.crab_nebula.MIN_STEM_LEN = self.minStemLen
            signatures, stems, suffixes = lxa5crab.find_signatures(wordCounts)
            self.morphology["signatures"] = signatures
            self.morphology["stems"] = stems
            self.morphology["suffixes"] = suffixes
        except ValueError as e:
            self.infoBox.setText(str(e), "warning")
            self.send("Morphologically analyzed data", None, self)
            self.morphology = dict()
            self.updateGUI()
            return False
        self.infoBox.setText(
            u"Processing, please wait (word parsing)...",
            "warning",
        )
        progressBar.advance(80)

        # Parse words...
        parser = lxa5crab.build_parser(wordCounts, signatures, stems, suffixes)
        self.morphology["parser"] = parser
        newSegments = list()
        num_analyzed_words = 0
        for segment in self.inputSeg:
            # The first parse of a word is the one used for annotation.
            bestParse = parser[segment.get_content()][0]
            newSegment = segment.deepcopy()
            if bestParse.signature:
                num_analyzed_words += 1
            newSegment.annotations.update(
                {
                    "stem": bestParse.stem,
                    "suffix": bestParse.suffix
                              if len(bestParse.suffix) else "NULL",
                    "signature": bestParse.signature,
                }
            )
            newSegments.append(newSegment)
        self.send(
            "Morphologically analyzed data",
            Segmentation(newSegments, self.captionTitle),
            self,
        )
        self.updateGUI()
        progressBar.advance(15)

        # Set status to OK and report data size (an empty input has no
        # analysed share to report, hence the guarded division)...
        numSegments = len(self.inputSeg)
        percentAnalyzed = (
            num_analyzed_words / numSegments * 100 if numSegments else 0.0
        )
        message = "%i segment@p sent to output (%.2f%% analyzed)." % (
            numSegments, percentAnalyzed)
        message = pluralize(message, numSegments)
        self.infoBox.setText(message)
        return True

    def updateGUI(self):
        """Update GUI state."""

        # Empty lists...
        self.mainWords = list()
        self.parses = list()
        self.stemsForParse = list()
        self.suffixesForParse = list()
        self.sigForParseBox.setTitle(" Signature ")
        self.mainSignatures = list()
        self.wordsForSig = list()
        self.stemsForSig = list()
        self.suffixesForSig = list()

        # Fill main lists if necessary...
        if len(self.morphology):

            # Main word list...
            wordCounts = self.morphology["wordCounts"]
            words = self._wordsByFrequency()
            max_count = wordCounts[words[0]] if words else 0
            padding = len(str(max_count)) + 1
            self.mainWords = [
                '{num: {width}} {word}'.format(
                    num=wordCounts[word],
                    width=padding,
                    word=word,
                ) for word in words
            ]

            # Main signature list (entry 0 is the NULL signature)...
            sigs = [["NULL"]] + list(self.morphology["signatures"].keys())
            padding = len(str(len(sigs))) + 1
            self.mainSignatures = [
                '{num: {width}} {sig}'.format(
                    num=idx,
                    width=padding,
                    sig=", ".join([suff or "NULL" for suff in sig]))
                for idx, sig in enumerate(sigs)
            ]

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class MovieReviews(OWTextableBaseWidget):
    """An Orange Textable widget that collects movie reviews from IMDb.

    The user searches IMDb by title, moves search results into a corpus
    (persisted in the ``myBasket`` setting) and, on send, the reviews of
    every movie in the corpus are downloaded and emitted as a single
    segmentation: one segment per review, annotated with a copy of the
    review's own fields.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Movie Reviews"
    description = "Get movie reviews from imdb"
    icon = ""
    priority = 15

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = []
    outputs = [("Segmentation", Segmentation)]

    #----------------------------------------------------------------------
    # Settings (versioned so that future versions can handle them)...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    autoSend = settings.Setting(True)
    myBasket = settings.Setting([])

    #----------------------------------------------------------------------
    # GUI layout parameters...

    want_main_area = False

    def __init__(self):
        """Create the widget and build its user interface."""
        super().__init__()

        # Search filter attributes.
        # NOTE(review): type_results, filter_results are exposed in the UI
        # but searchMovies() currently always performs a title search.
        self.newQuery = ''
        self.type_results = 'Title'
        self.filter_results = 'Popularity'
        self.nbr_results = '10'

        # Results box attributes.
        self.titleLabels = list()
        self.selectedTitles = list()
        # Maps a 1-based rank to {'name', 'id'[, 'year']}; filled by
        # searchMovies() and read by addToCorpus().
        self.searchResults = dict()

        # Corpus box attributes.
        self.myTitles = list()
        self.mytitleLabels = list()

        # Input objects created by the last call to sendData().
        self.createdInputs = list()

        # Mandatory declaration of the info box and the send button.
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute='infoBox',
            sendIfPreCallback=None,
        )

        #------------------------------------------------------------------
        # User interface...

        # Creation of the different working areas (creation order defines
        # the on-screen order).
        queryBox = gui.widgetBox(
            widget=self.controlArea,
            box="Search movies",
            orientation="horizontal",
        )
        filterBox = gui.widgetBox(
            widget=self.controlArea,
            box="Filters",
            orientation="horizontal",
        )
        searchButtonBox = gui.widgetBox(
            widget=self.controlArea,
            orientation="vertical",
        )
        resultBox = gui.widgetBox(
            widget=self.controlArea,
            box="Search results",
            orientation="vertical",
        )
        resultButtonBox = gui.widgetBox(
            widget=resultBox,
            box=False,
            orientation='horizontal',
        )
        corpusBox = gui.widgetBox(
            widget=self.controlArea,
            box="Corpus",
            orientation="vertical",
        )
        corpusButtonBox = gui.widgetBox(
            widget=corpusBox,
            box=False,
            orientation='horizontal',
        )

        # Free-text query.
        gui.lineEdit(
            widget=queryBox,
            master=self,
            value='newQuery',
            orientation='horizontal',
            label=u"Search: ",
            labelWidth=120,
            tooltip=("Enter a string"),
        )

        # Type of search.
        gui.comboBox(
            widget=queryBox,
            master=self,
            value="type_results",
            items=[
                "Title",
                "Actor",
                "Genre",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Search Type: ",
            labelWidth=120,
            tooltip=("Please select the desired search.\n"),
        )

        # Ordering of the results.
        gui.comboBox(
            widget=filterBox,
            master=self,
            value="filter_results",
            items=[
                "Popularity",
                "Alphabetical",
                "Random",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Search by: ",
            labelWidth=120,
            tooltip=("Please select the desired search.\n"),
        )

        # Maximum number of results (in steps of 10).
        gui.comboBox(
            widget=filterBox,
            master=self,
            value="nbr_results",
            items=[
                "10",
                "20",
                "30",
                "40",
                "50",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Number of results: ",
            labelWidth=120,
            tooltip=("Please select the desired search.\n"),
        )

        # Search button.
        self.searchButton = gui.button(
            widget=searchButtonBox,
            master=self,
            label="Search",
            callback=self.searchMovies,
            tooltip="Connect to imdbpy and make a research",
        )

        # List box holding the search results.
        self.titleListbox = gui.listBox(
            widget=resultBox,
            master=self,
            value="selectedTitles",
            labels="titleLabels",
            callback=None,
            tooltip="The list of titles whose content will be imported",
        )
        self.titleListbox.setMinimumHeight(150)
        self.titleListbox.setSelectionMode(3)

        # "Add to corpus" button.
        self.addButton = gui.button(
            widget=resultButtonBox,
            master=self,
            label=u'Add to corpus',
            callback=self.addToCorpus,
            tooltip=(u"Move the selected movie downward in your corpus."),
        )
        self.addButton.setDisabled(True)

        # "Clear results" button.
        self.clearButton = gui.button(
            widget=resultButtonBox,
            master=self,
            label="Clear results",
            callback=self.clearResults,
            tooltip="Clear results",
        )
        self.clearButton.setDisabled(True)

        # List box holding the movies confirmed for the corpus. The callback
        # refers to self.removeButton, which is created just below; this is
        # fine because the callback only runs on later user interaction.
        self.mytitleListbox = gui.listBox(
            widget=corpusBox,
            master=self,
            value="myTitles",
            labels="mytitleLabels",
            callback=lambda: self.removeButton.setDisabled(
                self.myTitles == list()),
            tooltip="The list of titles whose content will be imported",
        )
        self.mytitleListbox.setMinimumHeight(150)
        self.mytitleListbox.setSelectionMode(3)

        # "Remove from corpus" button.
        self.removeButton = gui.button(
            widget=corpusButtonBox,
            master=self,
            label=u'Remove from corpus',
            callback=self.remove,
            tooltip=(u"Remove the selected movie from your corpus."),
        )
        self.removeButton.setDisabled(True)

        # "Clear corpus" button.
        self.clearmyBasket = gui.button(
            widget=corpusButtonBox,
            master=self,
            label=u'Clear corpus',
            callback=self.clearCorpus,
            tooltip=(u"Remove all movies from your corpus."),
        )
        self.clearmyBasket.setDisabled(True)

        gui.separator(widget=corpusBox, height=3)
        gui.rubber(self.controlArea)

        #------------------------------------------------------------------
        # Draw Info box and Send button.
        self.sendButton.draw()
        self.searchButton.setDefault(True)
        self.infoBox.draw()

        # Show a basket restored from saved settings; without this the
        # corpus list would look empty although sendData() would use it.
        self.updateCorpus()

        # Send data if autoSend.
        self.sendButton.sendIf()

    @staticmethod
    def _movieLabel(movie):
        """Return the list box label ("name - year" or "name") of an entry."""
        if 'year' in movie:
            return f'{movie["name"]} - {movie["year"]}'
        return f'{movie["name"]}'

    def searchMovies(self):
        """Search the IMDb movie database for the query typed by the user.

        At most ``nbr_results`` hits are stored in ``self.searchResults``
        (keyed by 1-based rank) and displayed in the results list box.
        """
        query_string = self.newQuery
        if query_string == "":
            self.infoBox.setText("Please enter a movie title", "warning")
            return

        counter_max = int(self.nbr_results)
        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=counter_max)

        try:
            ia = imdb.IMDb()
            search = ia.search_movie(query_string)

            # Each result is stored in a dictionary with its title, its id
            # and its year of publication when the latter is specified. The
            # id is always stored because sendData() needs it.
            result_list = {}
            for result_id, result in enumerate(search, start=1):
                if result_id > counter_max:
                    break
                entry = {'name': result, 'id': result.movieID}
                try:
                    entry['year'] = result['year']
                except KeyError:
                    pass
                result_list[result_id] = entry
                # 1 tick on the progress bar of the widget.
                progressBar.advance()

            self.searchResults = result_list

            # Assigning the attribute (rather than mutating the list) is
            # what the listBox binding is given to refresh the display.
            self.titleLabels = [
                self._movieLabel(entry) for entry in result_list.values()
            ]

            self.clearButton.setDisabled(False)
            self.addButton.setDisabled(False)
        finally:
            # Always restore the interface, even if the lookup failed.
            progressBar.finish()
            self.controlArea.setDisabled(False)

    def addToCorpus(self):
        """Add the movies selected in the results list to the corpus."""
        for selectedTitle in self.selectedTitles:
            # List box indices are 0-based, search ranks are 1-based.
            newMovie = self.searchResults[selectedTitle + 1]
            if newMovie not in self.myBasket:
                self.myBasket.append(newMovie)
        self.updateCorpus()
        self.sendButton.settingsChanged()

    def updateCorpus(self):
        """Refresh the corpus list box and its buttons from the basket."""
        self.mytitleLabels = [
            self._movieLabel(movie) for movie in self.myBasket
        ]
        self.clearmyBasket.setDisabled(self.myBasket == list())
        self.removeButton.setDisabled(self.myTitles == list())

    def remove(self):
        """Remove the movies selected in the corpus list from the corpus."""
        selected = set(self.myTitles)
        self.myBasket = [
            movie for idx, movie in enumerate(self.myBasket)
            if idx not in selected
        ]
        self.updateCorpus()
        self.sendButton.settingsChanged()

    def sendData(self):
        """Download the reviews of the corpus movies and send them to output.

        Each review becomes one segment annotated with a copy of the review
        dictionary returned by IMDb.
        """
        # Skip if the corpus is empty.
        if not self.myBasket:
            self.infoBox.setText(
                "Your corpus is empty, please add some movies first",
                "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # Attempt to connect to IMDb and retrieve the reviews...
        list_review = list()
        try:
            ia = imdb.IMDb()
            for item in self.myBasket:
                list_review.append(ia.get_movie_reviews(item['id']))
                # 1 tick on the progress bar of the widget.
                progressBar.advance()
        # If an error occurs (e.g. http error, or memory error)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from imdb", "error")
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store review strings in Input objects, and keep one annotation
        # dictionary per review so both lists stay aligned.
        annotations = list()
        for movie in list_review:
            data = movie.get('data') or {}
            for review in data.get('reviews') or []:
                self.createdInputs.append(Input(review.get('content')))
                annotations.append(review.copy())

        # Nothing to concatenate if none of the movies has any review.
        if not self.createdInputs:
            self.infoBox.setText(
                "No reviews found for the movies in your corpus", "warning")
            self.send('Segmentation', None, self)
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # If there's only one item, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = f"{len(self.segmentation)} segment@p sent to output"
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += " (%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send('Segmentation', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def clearResults(self):
        """Clear the results list."""
        self.titleLabels = list()
        self.clearButton.setDisabled(True)
        # Nothing is left to add once the results are gone.
        self.addButton.setDisabled(True)

    def clearCreatedInputs(self):
        """Delete all Input objects that have been created."""
        for i in self.createdInputs:
            Segmentation.set_data(i[0].str_index, None)
        del self.createdInputs[:]

    def clearCorpus(self):
        """Remove all movies from the corpus."""
        self.mytitleLabels = list()
        self.myBasket = list()
        self.sendButton.settingsChanged()
        self.clearmyBasket.setDisabled(True)
        # Nothing is left to remove once the corpus is empty.
        self.removeButton.setDisabled(True)
class MovieReviews(OWTextableBaseWidget):
    """An Orange Textable widget that collects movie reviews from IMDb.

    The user searches IMDb by movie title or by actor/actress name, moves
    search results into a corpus (persisted in the ``myBasket`` setting)
    and, on send, the reviews of every movie in the corpus are downloaded
    and emitted as a single segmentation: one segment per review, annotated
    with the title and year of the reviewed movie.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Movie Reviews"
    description = "Get movie reviews from imdb"
    icon = ""
    priority = 15

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = []
    outputs = [("Segmentation", Segmentation)]

    #----------------------------------------------------------------------
    # Settings (versioned so that future versions can handle them)...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    autoSend = settings.Setting(False)
    myBasket = settings.Setting([])

    #----------------------------------------------------------------------
    # GUI layout parameters...

    want_main_area = False

    def __init__(self):
        """Create the widget and build its user interface."""
        super().__init__()

        # Search filter attributes.
        self.newQuery = ''
        self.type_results = 'Title'
        # NOTE(review): 'Popularity' is not one of the "Sort by" items
        # below, and the "Year" item has no matching branch in
        # searchMovies() -- confirm the intended ordering options.
        self.filter_results = 'Popularity'
        self.nbr_results = '10'

        # Results box attributes.
        self.titleLabels = list()
        self.selectedTitles = list()
        # Maps a 1-based rank to {'name', 'year', 'id'}; filled by
        # searchMovies() and read by addToCorpus().
        self.searchResults = dict()

        # Corpus box attributes.
        self.myTitles = list()
        self.mytitleLabels = list()

        # Shared imdbpy access object.
        self.ia = imdb.IMDb()

        # Input objects created by the last call to sendData().
        self.createdInputs = list()

        # Mandatory declaration of the info box and the send button.
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute='infoBox',
            sendIfPreCallback=None,
        )

        #------------------------------------------------------------------
        # User interface...

        # Creation of the different working areas (creation order defines
        # the on-screen order).
        self.queryBox = gui.widgetBox(
            widget=self.controlArea,
            box="Query",
            orientation="horizontal",
        )
        self.genreBox = gui.widgetBox(
            widget=self.controlArea,
            box="Query",
            orientation="horizontal",
        )
        self.filterBox = gui.widgetBox(
            widget=self.controlArea,
            box="Query options",
            orientation="horizontal",
        )
        searchButtonBox = gui.widgetBox(
            widget=self.controlArea,
            orientation="vertical",
        )
        resultBox = gui.widgetBox(
            widget=self.controlArea,
            box="Search results",
            orientation="vertical",
        )

        # List box holding the search results. The callback refers to
        # self.addButton, created further down; it only runs on later user
        # interaction.
        self.titleListbox = gui.listBox(
            widget=resultBox,
            master=self,
            value="selectedTitles",
            labels="titleLabels",
            callback=lambda: self.addButton.setDisabled(
                self.selectedTitles == list()),
            tooltip="The list of titles whose content will be imported",
        )
        self.titleListbox.doubleClicked.connect(self.addToCorpus)
        self.titleListbox.setMinimumHeight(150)
        self.titleListbox.setSelectionMode(3)

        resultButtonBox = gui.widgetBox(
            widget=resultBox,
            box=False,
            orientation='horizontal',
        )
        corpusBox = gui.widgetBox(
            widget=self.controlArea,
            box="Corpus",
            orientation="vertical",
        )

        # List box holding the movies confirmed for the corpus.
        self.mytitleListbox = gui.listBox(
            widget=corpusBox,
            master=self,
            value="myTitles",
            labels="mytitleLabels",
            callback=lambda: self.removeButton.setDisabled(
                self.myTitles == list()),
            tooltip="The list of titles whose content will be imported",
        )
        self.mytitleListbox.doubleClicked.connect(self.remove)
        self.mytitleListbox.setMinimumHeight(150)
        self.mytitleListbox.setSelectionMode(3)

        corpusButtonBox = gui.widgetBox(
            widget=corpusBox,
            box=False,
            orientation='horizontal',
        )

        # Free-text query.
        gui.lineEdit(
            widget=self.queryBox,
            master=self,
            value='newQuery',
            orientation='horizontal',
            label=u"Search: ",
            labelWidth=120,
            tooltip=("Enter a string"),
        )

        # Type of search (shown next to the query field).
        gui.comboBox(
            widget=self.queryBox,
            master=self,
            value="type_results",
            items=[
                "Title",
                "Actor",
            ],
            sendSelectedValue=True,
            callback=self.mode_changed,
            orientation="horizontal",
            label="Search by: ",
            labelWidth=120,
            tooltip=("Please select the desired search.\n"),
        )

        # Type of search (shown in the genre box, which mode_changed() only
        # reveals when type_results is "Genre").
        gui.comboBox(
            widget=self.genreBox,
            master=self,
            value="type_results",
            items=[
                "Title",
                "Actor",
                "Genre",
            ],
            sendSelectedValue=True,
            callback=self.mode_changed,
            orientation="horizontal",
            label="Search Type: ",
            labelWidth=120,
            tooltip=("Please select the desired search.\n"),
        )

        # Ordering of the results.
        self.searchFilter = gui.comboBox(
            widget=self.filterBox,
            master=self,
            value="filter_results",
            items=[
                "Year",
                "Alphabetical",
                "Random",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Sort by: ",
            labelWidth=120,
            tooltip=("Please select the desired search.\n"),
        )

        # Maximum number of results (in steps of 10).
        self.searchNbr = gui.comboBox(
            widget=self.filterBox,
            master=self,
            value="nbr_results",
            items=[
                "10",
                "20",
                "30",
                "40",
                "50",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Results' number: ",
            labelWidth=120,
            tooltip=("Please select the desired search.\n"),
        )

        # Search button.
        self.searchButton = gui.button(
            widget=searchButtonBox,
            master=self,
            label="Search",
            callback=self.searchMovies,
            tooltip="Connect to imdbpy and make a research",
        )

        # "Add to corpus" button.
        self.addButton = gui.button(
            widget=resultButtonBox,
            master=self,
            label=u'Add to corpus',
            callback=self.addToCorpus,
            tooltip=(u"Move the selected movie downward in your corpus."),
        )
        self.addButton.setDisabled(True)

        # "Clear results" button.
        self.clearButton = gui.button(
            widget=resultButtonBox,
            master=self,
            label="Clear results",
            callback=self.clearResults,
            tooltip="Clear results",
        )
        self.clearButton.setDisabled(True)

        # "Remove from corpus" button.
        self.removeButton = gui.button(
            widget=corpusButtonBox,
            master=self,
            label=u'Remove from corpus',
            callback=self.remove,
            tooltip=(u"Remove the selected movie from your corpus."),
        )
        self.removeButton.setDisabled(True)

        # "Clear corpus" button.
        self.clearmyBasket = gui.button(
            widget=corpusButtonBox,
            master=self,
            label=u'Clear corpus',
            callback=self.clearCorpus,
            tooltip=(u"Remove all movies from your corpus."),
        )
        self.clearmyBasket.setDisabled(True)

        gui.separator(widget=corpusBox, height=3)
        gui.rubber(self.controlArea)

        #------------------------------------------------------------------
        # Draw Info box and Send button.
        self.sendButton.draw()
        self.searchButton.setDefault(True)
        self.infoBox.draw()

        # Show a basket restored from saved settings and put the query
        # boxes in the state matching the current search type.
        self.updateCorpus()
        self.mode_changed()

        # Send data if autoSend.
        self.sendButton.sendIf()

    def mode_changed(self):
        """Show/hide the query boxes according to the selected search type."""
        self.sendButton.settingsChanged()
        if self.type_results == "Title":
            # Title search: hide the genre box, sorting is disabled.
            self.genreBox.setVisible(False)
            self.queryBox.setVisible(True)
            self.filterBox.setVisible(True)
            self.searchFilter.setDisabled(True)
            self.searchNbr.setVisible(True)
        elif self.type_results == "Genre":
            # Genre search: hide the title query box, sorting is enabled.
            self.queryBox.setVisible(False)
            self.genreBox.setVisible(True)
            self.filterBox.setVisible(True)
            self.searchFilter.setDisabled(False)
            self.searchNbr.setVisible(True)
        elif self.type_results == "Actor":
            # Actor search: hide the genre box, sorting is enabled.
            self.queryBox.setVisible(True)
            self.genreBox.setVisible(False)
            self.filterBox.setVisible(True)
            self.searchFilter.setDisabled(False)
            self.searchNbr.setVisible(True)

    def _searchFilmography(self, actor_name):
        """Return the filmography of the first person matching *actor_name*.

        An empty list is returned (and a warning displayed) when no person
        matches or when the person has no 'actor'/'actress' filmography.
        """
        people = self.ia.search_person(actor_name)
        if people:
            person = self.ia.get_person_filmography(people[0].personID)
            filmography = person['data']['filmography']
            for role in ('actor', 'actress'):
                if role in filmography:
                    return filmography[role]
        self.infoBox.setText(
            "Please enter a valid actor or actress name", "warning")
        return list()

    def searchMovies(self):
        """Search the IMDb database for the query typed by the user.

        Only movies with a known year are kept. At most ``nbr_results`` of
        them are stored in ``self.searchResults`` (keyed by 1-based rank)
        and displayed in the results list box.
        """
        query_string = self.newQuery
        if query_string == "":
            self.infoBox.setText(
                "Please type something in the search bar", "warning")
            return

        counter_max = int(self.nbr_results)
        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=counter_max)

        try:
            search = list()
            if self.type_results == 'Title':
                search = self.ia.search_movie(query_string)
            elif self.type_results == 'Actor':
                search = self._searchFilmography(query_string)

            # Keep only the movies that have an associated year.
            filtered_results = [film for film in search if 'year' in film]

            if self.filter_results == 'Random':
                random.shuffle(filtered_results)
            elif self.filter_results == 'Alphabetical':
                # Sort the hits themselves: no extra download per movie,
                # and movies sharing a title are all kept.
                filtered_results.sort(key=lambda film: str(film).lower())

            # Each result is stored in a dictionary with its title, year of
            # publication and id. Ranks are contiguous so that list box row
            # i always corresponds to rank i + 1.
            result_list = {}
            for film in filtered_results[:counter_max]:
                result_list[len(result_list) + 1] = {
                    'name': film,
                    'year': film['year'],
                    'id': film.movieID,
                }
                # 1 tick on the progress bar of the widget.
                progressBar.advance()

            self.searchResults = result_list

            # Assigning the attribute (rather than mutating the list) is
            # what the listBox binding is given to refresh the display.
            self.titleLabels = [
                f'{entry["name"]} - {entry["year"]}'
                for entry in result_list.values()
            ]

            self.clearButton.setDisabled(False)
            self.addButton.setDisabled(self.selectedTitles == list())
        finally:
            # Always restore the interface, even if the lookup failed.
            progressBar.finish()
            self.controlArea.setDisabled(False)

    def addToCorpus(self):
        """Add the movies selected in the results list to the corpus.

        A movie is only accepted if IMDb has reviews for it; if any newly
        selected movie has none, a warning is shown and the corpus is left
        unchanged.
        """
        candidates = list()
        for selectedTitle in self.selectedTitles:
            # List box indices are 0-based, search ranks are 1-based.
            newMovie = self.searchResults[selectedTitle + 1]
            if newMovie in self.myBasket or newMovie in candidates:
                continue
            # Test if the movie has reviews associated with it.
            reviews = self.ia.get_movie_reviews(newMovie['id'])
            if 'reviews' not in (reviews.get('data') or {}):
                self.infoBox.setText(
                    "Cannot add to corpus. One or more selected movies "
                    "have no associated reviews",
                    "warning")
                return
            candidates.append(newMovie)

        self.myBasket.extend(candidates)
        self.updateCorpus()
        self.sendButton.settingsChanged()

    def updateCorpus(self):
        """Refresh the corpus list box and its buttons from the basket."""
        self.mytitleLabels = [
            f'{movie["name"]} - {movie["year"]}' for movie in self.myBasket
        ]
        self.clearmyBasket.setDisabled(self.myBasket == list())
        self.removeButton.setDisabled(self.myTitles == list())

    def remove(self):
        """Remove the movies selected in the corpus list from the corpus."""
        selected = set(self.myTitles)
        self.myBasket = [
            movie for idx, movie in enumerate(self.myBasket)
            if idx not in selected
        ]
        self.updateCorpus()
        self.sendButton.settingsChanged()

    def sendData(self):
        """Download the reviews of the corpus movies and send them to output.

        Each review becomes one segment annotated with the "title" and
        "year" of the movie it belongs to.
        """
        # Skip if the corpus is empty.
        if not self.myBasket:
            self.infoBox.setText(
                "Your corpus is empty, please add some movies first",
                "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # Connect to imdb and collect reviews and movie data...
        list_review = list()
        list_annotation = list()
        try:
            for item in self.myBasket:
                list_review.append(self.ia.get_movie_reviews(item['id']))
                list_annotation.append(self.ia.get_movie(item['id']))
                # 1 tick on the progress bar of the widget.
                progressBar.advance()
        # If an error occurs (e.g. http error, or memory error)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from imdb", "error")
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store review strings in Input objects, and append exactly one
        # annotation dictionary per review so that annotations stay aligned
        # with segments whatever the number of reviews of each movie.
        annotations = list()
        for reviews, movie in zip(list_review, list_annotation):
            movie_annotations = {"title": movie, "year": movie["year"]}
            data = reviews.get('data') or {}
            for review in data.get('reviews') or []:
                self.createdInputs.append(Input(review.get('content')))
                annotations.append(movie_annotations.copy())

        # Nothing to concatenate if none of the movies has any review.
        if not self.createdInputs:
            self.infoBox.setText(
                "No reviews found for the movies in your corpus", "warning")
            self.send('Segmentation', None, self)
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # If there's only one item, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = f"{len(self.segmentation)} segment@p sent to output"
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += " (%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send('Segmentation', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def clearResults(self):
        """Clear the results list."""
        self.titleLabels = list()
        self.clearButton.setDisabled(True)
        self.addButton.setDisabled(self.titleLabels == list())

    def clearCreatedInputs(self):
        """Delete all Input objects that have been created."""
        for i in self.createdInputs:
            Segmentation.set_data(i[0].str_index, None)
        del self.createdInputs[:]

    def clearCorpus(self):
        """Remove all movies from the corpus."""
        self.mytitleLabels = list()
        self.myBasket = list()
        self.sendButton.settingsChanged()
        self.clearmyBasket.setDisabled(True)
class SpaCy(OWTextableBaseWidget):
    """Textable widget for NLP using spaCy.

    Every input segment is analyzed with the selected spaCy model and
    replaced in the output by one segment per token; each token segment
    inherits the annotations of its input segment and adds the token
    attributes listed in RELEVANT_KEYS.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "spaCy"
    description = "Natural language processing using spaCy"
    icon = "icons/spacy.svg"
    priority = 21  # TODO

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Text data", Segmentation, "inputData")]
    outputs = [("Linguistically analyzed data", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    autoSend = settings.Setting(False)
    model = settings.Setting("fr_core_news_sm")

    def __init__(self):
        """Widget creator."""
        super().__init__()

        # Other attributes...
        self.inputSeg = None
        self.nlp = None

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )

        # User interface...

        # Tabs...
        self.tabs = QTabWidget()
        self.optionsTab = QWidget()
        self.modelManagerTab = QWidget()
        self.tabs.addTab(self.optionsTab, "Options")
        self.tabs.addTab(self.modelManagerTab, "Model manager")

        # Options tab...
        optionsTabLayout = QHBoxLayout()
        optionsBox = gui.widgetBox(widget=self.optionsTab)
        self.modelComboBox = gui.comboBox(
            widget=optionsBox,
            master=self,
            value='model',
            label='Model: ',
            tooltip='Select the spaCy language model you want to use.',
            items=INSTALLED_MODELS,
            sendSelectedValue=True,
            callback=self.modelChanged,
        )
        optionsTabLayout.addWidget(optionsBox)
        self.optionsTab.setLayout(optionsTabLayout)

        # Model manager tab...
        modelManagerTabLayout = QHBoxLayout()
        modelManagerBox = gui.widgetBox(widget=self.modelManagerTab)
        # TODO: Model manager UI
        modelManagerTabLayout.addWidget(modelManagerBox)
        self.modelManagerTab.setLayout(modelManagerTabLayout)

        self.controlArea.layout().addWidget(self.tabs)

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input", "warning")

        # Load spaCy language model...
        self.modelChanged()

        # Send data if autoSend.
        self.sendButton.sendIf()

    def inputData(self, newInput):
        """Process incoming data."""
        self.inputSeg = newInput
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def modelChanged(self):
        """Respond to model change in UI by (re)loading the spaCy model.

        If the model cannot be loaded (spacy.load raises OSError when the
        model is not installed), ``self.nlp`` is set to None and an error
        is displayed instead of letting the exception break the widget.
        """
        try:
            self.nlp = spacy.load(self.model)
            loaded = True
        except OSError:
            self.nlp = None
            loaded = False
        self.sendButton.settingsChanged()
        if not loaded:
            # Reported after settingsChanged() so the error is the last
            # message written by this method.
            self.infoBox.setText(
                "Couldn't load spaCy model '%s'" % self.model,
                "error",
            )

    def sendData(self):
        """Compute result of widget processing and send to output"""

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            self.send("Linguistically analyzed data", None, self)
            return

        # Check that a model is available (see modelChanged)...
        if self.nlp is None:
            self.infoBox.setText(
                "Couldn't load spaCy model '%s'" % self.model,
                "error",
            )
            self.send("Linguistically analyzed data", None, self)
            return

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        tokenizedSegments = list()

        try:
            # Process each input segment...
            for segment in self.inputSeg:

                # Input segment attributes...
                inputContent = segment.get_content()
                inputAnnotations = segment.annotations
                inputString = segment.str_index
                inputStart = segment.start or 0

                # NLP analysis...
                doc = self.nlp(inputContent)

                # Process each token in input segment...
                for token in doc:
                    tokenAnnotations = inputAnnotations.copy()
                    for key in RELEVANT_KEYS:
                        value = getattr(token, key)
                        if value is not None:
                            tokenAnnotations[key] = value
                    # token.idx is relative to the analyzed content, hence
                    # the shift by the input segment's own start.
                    tokenStart = inputStart + token.idx
                    tokenizedSegments.append(
                        Segment(
                            str_index=inputString,
                            start=tokenStart,
                            end=tokenStart + len(token),
                            annotations=tokenAnnotations,
                        ))

                progressBar.advance()
        finally:
            # Clear progress bar and re-enable the interface even if the
            # analysis raised.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        outputSeg = Segmentation(tokenizedSegments, self.captionTitle)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output." % len(outputSeg)
        message = pluralize(message, len(outputSeg))
        self.infoBox.setText(message)

        # Send data to output...
        self.send("Linguistically analyzed data", outputSeg, self)

        self.sendButton.resetSettingsChangedFlag()

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class SuperTextFiles(OWTextableBaseWidget):
    """Textable widget importing text from raw text, PDF and image files.

    Raw text files are decoded with the selected (or auto-detected) encoding.
    PDF files are read with pdfplumber when their first page contains text,
    and otherwise (or when OCR is forced) processed with PyMuPDF + Tesseract.
    Image files are sent directly to Tesseract.

    Each entry of ``files`` is a 7-item sequence:
    ``(path, encoding, annotation_key, annotation_value, pdf_password,
    ocr_languages, ocr_force)``.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Super Text Files"
    description = "Import data from raw text and PDF files"
    icon = "icons/SuperTextFiles.svg"
    priority = 1  # TODO

    #----------------------------------------------------------------------
    # Channel definitions....

    inputs = [('Message', JSONMessage, "inputMessage", widget.Single)]
    outputs = [('Text data', Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    files = settings.Setting([])
    encoding = settings.Setting('(auto-detect)')
    autoNumber = settings.Setting(False)
    autoNumberKey = settings.Setting(u'num')
    importFilenames = settings.Setting(True)
    importFilenamesKey = settings.Setting(u'filename')
    lastLocation = settings.Setting('.')
    displayAdvancedSettings = settings.Setting(False)
    file = settings.Setting(u'')

    def __init__(self, *args, **kwargs):
        """Create the widget and build its basic and advanced interfaces."""
        super().__init__(*args, **kwargs)

        # Other (non-setting) attributes...
        self.segmentation = None
        self.createdInputs = list()
        self.fileLabels = list()
        self.selectedFileLabels = list()
        self.newFiles = u''
        self.newAnnotationKey = u''
        self.newAnnotationValue = u''
        self.pdfPassword = u''      # SuperTextFiles
        self.ocrForce = False       # SuperTextFiles
        self.ocrLanguages = u'eng'  # SuperTextFiles
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute='infoBox',
            sendIfPreCallback=self.updateGUI,
        )
        self.advancedSettings = AdvancedSettings(
            widget=self.controlArea,
            master=self,
            callback=self.sendButton.settingsChanged,
        )

        # GUI...

        # Advanced settings checkbox...
        self.advancedSettings.draw()

        # BASIC GUI...

        # Basic file box
        basicFileBox = gui.widgetBox(
            widget=self.controlArea,
            box=u'Source',
            orientation='vertical',
            addSpace=False,
        )
        basicFileBoxLine1 = gui.widgetBox(
            widget=basicFileBox,
            box=False,
            orientation='horizontal',
        )
        gui.lineEdit(
            widget=basicFileBoxLine1,
            master=self,
            value='file',
            orientation='horizontal',
            label=u'File path:',
            labelWidth=101,
            callback=self.sendButton.settingsChanged,
            tooltip=(u"The path of the file."),
        )
        gui.separator(widget=basicFileBoxLine1, width=5)
        gui.button(
            widget=basicFileBoxLine1,
            master=self,
            label=u'Browse',
            callback=self.browse,
            tooltip=(u"Open a dialog for selecting file."),
        )
        gui.separator(widget=basicFileBox, width=3)
        advancedEncodingsCombobox = gui.comboBox(
            widget=basicFileBox,
            master=self,
            value='encoding',
            items=getPredefinedEncodings(),
            sendSelectedValue=True,
            orientation='horizontal',
            label=u'Encoding:',
            labelWidth=101,
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Select input file(s) encoding."),
        )
        addSeparatorAfterDefaultEncodings(advancedEncodingsCombobox)
        addAutoDetectEncoding(advancedEncodingsCombobox)
        gui.separator(widget=basicFileBox, width=3)
        self.advancedSettings.basicWidgets.append(basicFileBox)
        self.advancedSettings.basicWidgetsAppendSeparator()

        # ADVANCED GUI...

        defaultLabelWidth = 120  # SuperTextFiles

        # File box
        fileBox = gui.widgetBox(
            widget=self.controlArea,
            box=u'Sources',
            orientation='vertical',
            addSpace=False,
        )
        fileBoxLine1 = gui.widgetBox(
            widget=fileBox,
            box=False,
            orientation='horizontal',
            addSpace=True,
        )
        self.fileListbox = gui.listBox(
            widget=fileBoxLine1,
            master=self,
            value='selectedFileLabels',
            labels='fileLabels',
            callback=self.updateFileBoxButtons,
            tooltip=(
                u"The list of files whose content will be imported.\n"
                u"\nIn the output segmentation, the content of each\n"
                u"file appears in the same position as in the list.\n"
                u"\nColumn 1 shows the file's name.\n"
                u"Column 2 shows the file's annotation (if any).\n"
                # Start SuperTextFiles
                u"Column 3 shows the file's password (if any).\n"
                u"Column 4 shows the file's languages (if any).\n"
                u"Column 5 shows if OCR is forced.\n"
                u"Column 6 shows the file's encoding."
                # End SuperTextFiles
            ),
        )
        # Monospaced font so that the padded columns of each label line up.
        font = QFont()
        font.setFamily('Courier')
        font.setStyleHint(QFont.Courier)
        font.setPixelSize(12)
        self.fileListbox.setFont(font)
        fileBoxCol2 = gui.widgetBox(
            widget=fileBoxLine1,
            orientation='vertical',
        )
        self.moveUpButton = gui.button(
            widget=fileBoxCol2,
            master=self,
            label=u'Move Up',
            callback=self.moveUp,
            tooltip=(u"Move the selected file upward in the list."),
        )
        self.moveDownButton = gui.button(
            widget=fileBoxCol2,
            master=self,
            label=u'Move Down',
            callback=self.moveDown,
            tooltip=(u"Move the selected file downward in the list."),
        )
        self.removeButton = gui.button(
            widget=fileBoxCol2,
            master=self,
            label=u'Remove',
            callback=self.remove,
            tooltip=(u"Remove the selected file from the list."),
        )
        self.clearAllButton = gui.button(
            widget=fileBoxCol2,
            master=self,
            label=u'Clear All',
            callback=self.clearAll,
            tooltip=(u"Remove all files from the list."),
        )
        self.exportButton = gui.button(
            widget=fileBoxCol2,
            master=self,
            label=u'Export List',
            callback=self.exportList,
            tooltip=(u"Open a dialog for selecting a file where the file\n"
                     u"list can be exported in JSON format."),
        )
        self.importButton = gui.button(
            widget=fileBoxCol2,
            master=self,
            label=u'Import List',
            callback=self.importList,
            tooltip=(u"Open a dialog for selecting a file list to\n"
                     u"import (in JSON format). Files from this list\n"
                     u"will be added to those already imported."),
        )
        fileBoxLine2 = gui.widgetBox(
            widget=fileBox,
            box=False,
            orientation='vertical',
        )

        # Add file box
        addFileBox = gui.widgetBox(
            widget=fileBoxLine2,
            box=True,
            orientation='vertical',
        )
        addFileBoxLine1 = gui.widgetBox(
            widget=addFileBox,
            orientation='horizontal',
        )
        gui.lineEdit(
            widget=addFileBoxLine1,
            master=self,
            value='newFiles',
            orientation='horizontal',
            label=u'File path(s):',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"The paths of the files that will be added to the\n"
                     u"list when button 'Add' is clicked.\n\n"
                     u"Successive paths must be separated with ' / ' \n"
                     u"(whitespace + slash + whitespace). Their order in\n"
                     u"the list will be the same as in this field."),
        )
        gui.separator(widget=addFileBoxLine1, width=5)
        gui.button(
            widget=addFileBoxLine1,
            master=self,
            label=u'Browse',
            callback=self.browse,
            tooltip=(u"Open a dialog for selecting files.\n\n"
                     u"To select multiple files at once, either draw a\n"
                     u"selection box around them, or use shift and/or\n"
                     u"ctrl + click.\n\n"
                     u"Selected file paths will appear in the field to\n"
                     u"the left of this button afterwards, ready to be\n"
                     u"added to the list when button 'Add' is clicked."),
        )
        gui.separator(widget=addFileBox, width=3)
        basicEncodingsCombobox = gui.comboBox(
            widget=addFileBox,
            master=self,
            value='encoding',
            items=getPredefinedEncodings(),
            sendSelectedValue=True,
            orientation='horizontal',
            label=u'Encoding:',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"Select input file(s) encoding."),
        )
        addSeparatorAfterDefaultEncodings(basicEncodingsCombobox)
        addAutoDetectEncoding(basicEncodingsCombobox)
        # NOTE(review): self-assignment kept from the original; presumably it
        # re-synchronises the combo boxes with the stored setting -- confirm.
        self.encoding = self.encoding
        gui.separator(widget=addFileBox, width=3)
        gui.lineEdit(
            widget=addFileBox,
            master=self,
            value='newAnnotationKey',
            orientation='horizontal',
            label=u'Annotation key:',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"This field lets you specify a custom annotation\n"
                     u"key associated with each file that is about to be\n"
                     u"added to the list."),
        )
        gui.separator(widget=addFileBox, width=3)
        gui.lineEdit(
            widget=addFileBox,
            master=self,
            value='newAnnotationValue',
            orientation='horizontal',
            label=u'Annotation value:',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"This field lets you specify the annotation value\n"
                     u"associated with the above annotation key."),
        )

        ### Start SuperTextFiles addition
        gui.separator(widget=addFileBox, width=3)

        # Field for PDF password
        # NOTE(review): the source text between this field's label and the
        # next field's `value` argument was unreadable; labelWidth, callback
        # and tooltip below were reconstructed by analogy with the sibling
        # fields -- confirm the tooltip wording against the upstream file.
        gui.lineEdit(
            widget=addFileBox,
            master=self,
            value='pdfPassword',
            orientation='horizontal',
            label=u'PDF password:',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"This field lets you specify a password\n"
                     u"if the PDF file needs one."),
        )

        # Field for OCR languages
        gui.lineEdit(
            widget=addFileBox,
            master=self,
            value='ocrLanguages',
            orientation='horizontal',
            label=u'OCR Language(s):',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"This field lets you specify languages\n"
                     u"for the OCR process. Ex.: fra+ita"),
        )
        gui.checkBox(
            widget=addFileBox,
            master=self,
            value='ocrForce',
            label=u'Force OCR',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"Force to use an OCR detection on this file"),
        )
        ### End SuperTextFiles addition

        gui.separator(widget=addFileBox, width=3)
        self.addButton = gui.button(
            widget=addFileBox,
            master=self,
            label=u'Add',
            callback=self.add,
            tooltip=(u"Add the file(s) currently displayed in the\n"
                     u"'Files' text field to the list.\n\n"
                     u"Each of these files will be associated with the\n"
                     u"specified encoding and annotation (if any).\n\n"
                     u"Other files may be selected afterwards and\n"
                     u"assigned a different encoding and annotation."),
        )
        self.advancedSettings.advancedWidgets.append(fileBox)
        self.advancedSettings.advancedWidgetsAppendSeparator()

        # Options box...
        optionsBox = gui.widgetBox(
            widget=self.controlArea,
            box=u'Options',
            orientation='vertical',
            addSpace=False,
        )
        optionsBoxLine1 = gui.widgetBox(
            widget=optionsBox,
            box=False,
            orientation='horizontal',
        )
        gui.checkBox(
            widget=optionsBoxLine1,
            master=self,
            value='importFilenames',
            label=u'Import file names with key:',
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Import file names as annotations."),
        )
        self.importFilenamesKeyLineEdit = gui.lineEdit(
            widget=optionsBoxLine1,
            master=self,
            value='importFilenamesKey',
            orientation='horizontal',
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Annotation key for importing file names."),
        )
        gui.separator(widget=optionsBox, width=3)
        optionsBoxLine2 = gui.widgetBox(
            widget=optionsBox,
            box=False,
            orientation='horizontal',
        )
        gui.checkBox(
            widget=optionsBoxLine2,
            master=self,
            value='autoNumber',
            label=u'Auto-number with key:',
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Annotate files with increasing numeric indices."),
        )
        self.autoNumberKeyLineEdit = gui.lineEdit(
            widget=optionsBoxLine2,
            master=self,
            value='autoNumberKey',
            orientation='horizontal',
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Annotation key for file auto-numbering."),
        )
        gui.separator(widget=optionsBox, width=3)
        self.advancedSettings.advancedWidgets.append(optionsBox)
        self.advancedSettings.advancedWidgetsAppendSeparator()

        gui.rubber(self.controlArea)

        # Send button...
        self.sendButton.draw()

        # Info box...
        self.infoBox.draw()

        self.adjustSizeWithTimer()
        # Deferred through the event loop rather than called directly.
        QTimer.singleShot(0, self.sendButton.sendIf)

    @staticmethod
    def _parseFileEntries(json_data):
        """Convert decoded JSON file entries to 7-item file tuples.

        Returns the list of tuples, or None as soon as one entry lacks a
        non-empty 'path', 'encoding' or 'ocr_force' value.
        """
        parsed = list()
        for entry in json_data:
            path = entry.get('path', '')
            encoding = entry.get('encoding', '')
            ocrForce = entry.get('ocr_force', '')  # SuperTextFiles
            if path == '' or encoding == '' or ocrForce == '':
                return None
            parsed.append((
                path,
                encoding,
                entry.get('annotation_key', ''),
                entry.get('annotation_value', ''),
                entry.get('pdf_password', ''),   # SuperTextFiles
                entry.get('ocr_languages', ''),  # SuperTextFiles
                ocrForce,
            ))
        return parsed

    def _abortSend(self, progressBar, message):
        """Report `message` as an error, send None and re-enable the GUI."""
        progressBar.finish()
        self.infoBox.setText(message, 'error')
        self.send('Text data', None, self)
        self.controlArea.setDisabled(False)

    def inputMessage(self, message):
        """Handle JSON message on input connection.

        The message replaces the current file list; the widget switches to
        its advanced interface.
        """
        if not message:
            return
        self.displayAdvancedSettings = True
        self.advancedSettings.setVisible(True)
        self.clearAll()
        self.infoBox.inputChanged()
        try:
            temp_files = self._parseFileEntries(json.loads(message.content))
        except ValueError:
            self.infoBox.setText(
                u"Please make sure that incoming message is valid JSON.",
                'error')
            self.send('Text data', None, self)
            return
        if temp_files is None:
            self.infoBox.setText(
                u"Please verify keys and values of incoming "
                u"JSON message.", 'error')
            self.send('Text data', None, self)
            return
        self.files.extend(temp_files)
        self.sendButton.settingsChanged()

    def sendData(self):
        """Load files, create and send segmentation."""

        # Check that there's something on input...
        if (
            (self.displayAdvancedSettings and not self.files) or
            not (self.file or self.displayAdvancedSettings)
        ):
            self.infoBox.setText(u'Please select input file.', 'warning')
            self.send('Text data', None, self)
            return

        # Check that autoNumberKey is not empty (if necessary)...
        if (self.displayAdvancedSettings and self.autoNumber
                and not self.autoNumberKey):
            self.infoBox.setText(
                u'Please enter an annotation key for auto-numbering.',
                'warning')
            self.send('Text data', None, self)
            return

        # Clear created Inputs...
        self.clearCreatedInputs()

        fileContents = list()
        annotations = list()
        counter = 1

        if self.displayAdvancedSettings:
            myFiles = self.files
        else:
            # Basic mode: single file, no annotation, no password, default
            # OCR language, OCR not forced.
            myFiles = [[self.file, self.encoding, "", "", "", "eng", False]]

        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(myFiles))

        # Open and process each file successively...
        for myFile in myFiles:
            filePath = myFile[0]
            # Strip a trailing " (...)" suffix from the encoding label.
            encoding = re.sub(r"[ ]\(.+", "", myFile[1])
            annotation_key = myFile[2]
            annotation_value = myFile[3]
            pdf_password = myFile[4]   # SuperTextFiles
            ocr_languages = myFile[5]  # SuperTextFiles
            ocr_force = myFile[6]      # SuperTextFiles

            # Try to open the file...
            self.error()
            # -1 is the failure marker also returned by the OCR helpers; it
            # stays in place when the detected file type is not supported.
            fileContent = -1
            try:
                # Type detection reads the file, so it belongs in the try.
                myFiletype = filetype.guess(filePath)
                if myFiletype is None:
                    fileContent = self.extract_raw_text(filePath, encoding)
                elif myFiletype.extension == "pdf":
                    if (ocr_force is not True
                            and self.is_textual_pdf_file(
                                filePath, pdf_password)):
                        fileContent = self.extract_text_from_pdf(
                            filePath, pdf_password)
                    else:
                        fileContent = self.get_pdf_content(
                            filePath,
                            ocr_languages,
                        )
                elif myFiletype.extension in IMG_FILETYPES:
                    fileContent = self.ocrize(filePath, ocr_languages)
            except IOError as e:
                if "tesseract" in str(e):
                    QMessageBox.warning(None, 'Textable', str(e),
                                        QMessageBox.Ok)
                if len(myFiles) > 1:
                    message = u"Couldn't open file '%s'." % filePath
                else:
                    message = u"Couldn't open file."
                self._abortSend(progressBar, message)
                return
            except (UnicodeError, LookupError):
                # Wrong or unknown encoding for a raw text file.
                if len(myFiles) > 1:
                    message = u"Please select another encoding " \
                              + u"for file %s." % filePath
                else:
                    message = u"Please select another encoding."
                self._abortSend(progressBar, message)
                return

            if fileContent == -1:
                self._abortSend(progressBar, u"Couldn't open file.")
                return

            # Remove utf-8 BOM if necessary...
            if encoding == u'utf-8':
                fileContent = fileContent.lstrip(
                    codecs.BOM_UTF8.decode('utf-8'))

            # Normalize text (canonical decomposition then composition)...
            fileContent = normalize('NFC', fileContent)

            fileContents.append(fileContent)

            # Annotations...
            annotation = dict()
            if self.displayAdvancedSettings:
                if annotation_key and annotation_value:
                    annotation[annotation_key] = annotation_value
                if self.importFilenames and self.importFilenamesKey:
                    filename = os.path.basename(filePath)
                    annotation[self.importFilenamesKey] = filename
                if self.autoNumber and self.autoNumberKey:
                    annotation[self.autoNumberKey] = counter
                    counter += 1
            annotations.append(annotation)
            progressBar.advance()

        # Create an LTTL.Input for each file...
        if len(fileContents) == 1:
            label = self.captionTitle
        else:
            label = None
        for content, annotation in zip(fileContents, annotations):
            myInput = Input(content, label)
            segment = myInput[0]
            segment.annotations.update(annotation)
            myInput[0] = segment
            self.createdInputs.append(myInput)

        # If there's only one file, the widget's output is the created Input.
        if len(fileContents) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                segmentations=self.createdInputs,
                label=self.captionTitle,
                copy_annotations=True,
                import_labels_as=None,
                sort=False,
                auto_number_as=None,
                merge_duplicates=False,
                progress_callback=None,
            )

        message = u'%i segment@p sent to output ' % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def extract_raw_text(self, filePath, encoding):
        """Return the decoded text of the file at `filePath`.

        When `encoding` is "(auto-detect)", the encoding is first guessed
        with a UniversalDetector fed line by line. Line breaks inside each
        chunk read are normalized to "\\n".

        Raises IOError if the file cannot be opened, and UnicodeError (or
        LookupError for an unknown encoding name) if it cannot be decoded;
        callers are expected to handle these.
        """
        if encoding == "(auto-detect)":
            detector = UniversalDetector()
            with open(filePath, 'rb') as fh:
                for line in fh:
                    detector.feed(line)
                    if detector.done:
                        break
            detector.close()
            encoding = detector.result['encoding']

        # Plain 'r' already gives universal newlines in Python 3; the former
        # 'rU' mode is rejected by Python 3.11+.
        pieces = list()
        with open(filePath, mode='r', encoding=encoding) as fh:
            for chunk in iter(lambda: fh.read(CHUNK_LENGTH), ""):
                # NOTE(review): as in the original, a line break ending a
                # chunk is dropped by splitlines()/join.
                pieces.append('\n'.join(chunk.splitlines()))
        return "".join(pieces)

    def is_textual_pdf_file(self, filePath, password=None):
        """Return True if the first page of the PDF holds extractable text.

        `password` defaults to the value of the 'PDF password' field.
        """
        if password is None:
            password = self.pdfPassword
        with pdfplumber.open(filePath, password=password) as fh:
            if not fh.pages:
                return False
            text = fh.pages[0].extract_text()
        # None, "" and whitespace-only text all mean "no usable text".
        return bool(text and text.strip())

    def extract_text_from_pdf(self, filePath, password=None):
        """Return the concatenated text of all pages of the PDF.

        `password` defaults to the value of the 'PDF password' field.
        """
        if password is None:
            password = self.pdfPassword
        with pdfplumber.open(filePath, password=password) as fh:
            # extract_text() may yield None for a page without text.
            return "".join(page.extract_text() or "" for page in fh.pages)

    def get_pdf_content(self, filePath, languages):
        """Return the PDF's text followed by the OCR of its embedded images.

        Returns -1 as soon as OCR fails on one image.
        """
        text = ""
        with fitz.open(filePath) as doc:
            images = []
            for page in doc:
                text += page.getText("text")
                images += doc.getPageImageList(page.number)

            for image in images:
                xref = image[0]
                picture = fitz.Pixmap(doc, xref)
                if picture.n > 4:  # CMYK colorspace
                    picture = fitz.Pixmap(fitz.csRGB, picture)  # to RGB
                bytes_img = BytesIO(picture.getImageData())
                page_text = self.ocrize(bytes_img, languages)
                if page_text == -1:
                    text = -1
                    break
                elif page_text:
                    text += page_text
        return text

    def ocrize(self, image, languages):
        """Run Tesseract OCR on `image` (a path or file-like object).

        `languages` is passed to Tesseract as `lang`; blank means "eng".
        Returns the recognized text, or -1 on TesseractError.
        """
        languages = languages.strip()
        if languages == "":
            languages = "eng"
        try:
            with Image.open(image) as picture:
                return image_to_string(picture, lang=languages)
        except TesseractError as e:
            if "load" in str(e):
                QMessageBox.warning(
                    None, 'Textable',
                    "Please make sure all Tesseract parameter files for "
                    "language(s) '%s' have been installed." % languages,
                    QMessageBox.Ok)
            return -1

    def clearCreatedInputs(self):
        """Release the data of every Input created by this widget."""
        for i in self.createdInputs:
            Segmentation.set_data(i[0].str_index, None)
        del self.createdInputs[:]

    def importList(self):
        """Display a FileDialog and import file list."""
        filePath, _ = QFileDialog.getOpenFileName(self, u'Import File List',
                                                  self.lastLocation,
                                                  u'Text files (*)')
        if not filePath:
            return
        # NOTE(review): this stores the list file's path in the basic-mode
        # 'file' setting; kept from the original -- confirm it is intended.
        self.file = os.path.normpath(filePath)
        self.lastLocation = os.path.dirname(filePath)
        self.error()
        try:
            with codecs.open(filePath, encoding='utf8') as fileHandle:
                fileContent = fileHandle.read()
        except IOError:
            QMessageBox.warning(None, 'Textable', "Couldn't open file.",
                                QMessageBox.Ok)
            return
        try:
            temp_files = self._parseFileEntries(json.loads(fileContent))
        except ValueError:
            QMessageBox.warning(None, 'Textable', "JSON parsing error.",
                                QMessageBox.Ok)
            return
        if temp_files is None:
            QMessageBox.warning(
                None, 'Textable',
                "Selected JSON file doesn't have the right keys "
                "and/or values.", QMessageBox.Ok)
            return
        self.files.extend(temp_files)
        if temp_files:
            self.sendButton.settingsChanged()

    def exportList(self):
        """Display a FileDialog and export file list."""
        toDump = list()
        for myfile in self.files:
            entry = {
                'path': myfile[0],
                'encoding': myfile[1],
            }
            if myfile[2] and myfile[3]:
                entry['annotation_key'] = myfile[2]
                entry['annotation_value'] = myfile[3]
            # Start SuperTextFiles
            if myfile[4]:
                entry['pdf_password'] = myfile[4]
            if myfile[5]:
                entry['ocr_languages'] = myfile[5]
            entry['ocr_force'] = myfile[6]
            # End SuperTextFiles
            toDump.append(entry)
        filePath, _ = QFileDialog.getSaveFileName(
            self,
            u'Export File List',
            self.lastLocation,
        )
        if filePath:
            self.lastLocation = os.path.dirname(filePath)
            with codecs.open(
                filePath,
                encoding='utf8',
                mode='w',
                errors='xmlcharrefreplace',
            ) as outputFile:
                outputFile.write(
                    normalizeCarriageReturns(
                        json.dumps(toDump, sort_keys=True, indent=4)))
            QMessageBox.information(None, 'Textable',
                                    'File list correctly exported',
                                    QMessageBox.Ok)

    def browse(self):
        """Display a FileDialog and select files."""
        if self.displayAdvancedSettings:
            filePathList, _ = QFileDialog.getOpenFileNames(
                self, u'Select Text File(s)', self.lastLocation,
                u'Text files (*)')
            if not filePathList:
                return
            filePathList = [os.path.normpath(f) for f in filePathList]
            self.newFiles = u' / '.join(filePathList)
            self.lastLocation = os.path.dirname(filePathList[-1])
            self.updateGUI()
        else:
            filePath, _ = QFileDialog.getOpenFileName(self, u'Open Text File',
                                                      self.lastLocation,
                                                      u'Text files (*)')
            if not filePath:
                return
            self.file = os.path.normpath(filePath)
            self.lastLocation = os.path.dirname(filePath)
            self.updateGUI()
            self.sendButton.settingsChanged()

    def moveUp(self):
        """Move file upward in Files listbox."""
        if self.selectedFileLabels:
            index = self.selectedFileLabels[0]
            if index > 0:
                self.files[index - 1], self.files[index] = (
                    self.files[index], self.files[index - 1])
                self.selectedFileLabels = [index - 1]
                self.sendButton.settingsChanged()

    def moveDown(self):
        """Move file downward in Files listbox."""
        if self.selectedFileLabels:
            index = self.selectedFileLabels[0]
            if index < len(self.files) - 1:
                self.files[index + 1], self.files[index] = (
                    self.files[index], self.files[index + 1])
                self.selectedFileLabels = [index + 1]
                self.sendButton.settingsChanged()

    def clearAll(self):
        """Remove all files from files attr."""
        del self.files[:]
        del self.selectedFileLabels[:]
        self.sendButton.settingsChanged()

    def remove(self):
        """Remove file from files attr."""
        if self.selectedFileLabels:
            index = self.selectedFileLabels[0]
            self.files.pop(index)
            del self.selectedFileLabels[:]
            self.sendButton.settingsChanged()

    def add(self):
        """Add files to files attr.

        Paths are read from the 'File path(s)' field, separated by ' / ';
        each is stored with the encoding, annotation, password and OCR
        options currently set in the interface.
        """
        filePathList = re.split(r' +/ +', self.newFiles)
        encoding = re.sub(r"[ ]\(.+", "", self.encoding)
        for filePath in filePathList:
            self.files.append((
                filePath,
                encoding,
                self.newAnnotationKey,
                self.newAnnotationValue,
                self.pdfPassword,   # SuperTextFiles
                self.ocrLanguages,  # SuperTextFiles
                self.ocrForce,      # SuperTextFiles
            ))
        self.sendButton.settingsChanged()

    def updateGUI(self):
        """Update GUI state.

        In advanced mode, rebuilds the padded multi-column labels of the
        file list and enables/disables the dependent controls.
        """
        if not self.displayAdvancedSettings:
            self.advancedSettings.setVisible(False)
            return

        if self.selectedFileLabels:
            cachedLabel = self.selectedFileLabels[0]
        else:
            cachedLabel = None
        del self.fileLabels[:]
        if self.files:
            filenames = [os.path.basename(f[0]) for f in self.files]
            encodings = [f[1] for f in self.files]
            annotations = ['{%s: %s}' % (f[2], f[3]) for f in self.files]
            maxFilenameLen = max([len(n) for n in filenames])
            maxAnnoLen = max([len(a) for a in annotations])
            # Start SuperTextFiles
            pdfPassword = [f[4] for f in self.files]
            ocrLanguages = [f[5] for f in self.files]
            ocrForce = [str(f[6]) for f in self.files]
            maxPdfPasswordLen = max([len(n) for n in pdfPassword])
            maxOcrLanguagesLen = max([len(n) for n in ocrLanguages])
            # End SuperTextFiles
            padded = u'%-*s'  # left-justify value in a field of given width
            for index in range(len(self.files)):
                fileLabel = padded % (maxFilenameLen + 2, filenames[index])
                # An empty annotation renders as '{: }' (4 characters).
                if maxAnnoLen > 4:
                    if len(annotations[index]) > 4:
                        fileLabel += padded % (
                            maxAnnoLen + 2, annotations[index])
                    else:
                        fileLabel += u' ' * (maxAnnoLen + 2)
                # Start SuperTextFiles
                fileLabel += padded % (
                    maxPdfPasswordLen + 2, pdfPassword[index])
                fileLabel += padded % (
                    maxOcrLanguagesLen + 2, ocrLanguages[index])
                fileLabel += padded % (5 + 2, ocrForce[index])
                # End SuperTextFiles
                fileLabel += encodings[index]
                self.fileLabels.append(fileLabel)
        # NOTE(review): self-assignment kept from the original; presumably
        # it notifies the list box that its labels changed -- confirm.
        self.fileLabels = self.fileLabels
        if cachedLabel is not None:
            # Pre-callback is unset while the selection is restored, then
            # put back.
            self.sendButton.sendIfPreCallback = None
            self.selectedFileLabels = [cachedLabel]
            self.sendButton.sendIfPreCallback = self.updateGUI

        # 'Add' needs a path, and annotation key/value both set or both
        # empty.
        keyGiven = bool(self.newAnnotationKey)
        valueGiven = bool(self.newAnnotationValue)
        self.addButton.setDisabled(
            not self.newFiles or keyGiven != valueGiven)
        self.autoNumberKeyLineEdit.setDisabled(not self.autoNumber)
        self.importFilenamesKeyLineEdit.setDisabled(not self.importFilenames)
        self.updateFileBoxButtons()
        self.advancedSettings.setVisible(True)

    def updateFileBoxButtons(self):
        """Update state of File box buttons."""
        if self.selectedFileLabels:
            selected = self.selectedFileLabels[0]
            self.removeButton.setDisabled(False)
            self.moveUpButton.setDisabled(not selected > 0)
            self.moveDownButton.setDisabled(
                not selected < len(self.files) - 1)
        else:
            self.moveUpButton.setDisabled(True)
            self.moveDownButton.setDisabled(True)
            self.removeButton.setDisabled(True)
        hasFiles = bool(len(self.files))
        self.clearAllButton.setDisabled(not hasFiles)
        self.exportButton.setDisabled(not hasFiles)

    def setCaption(self, title):
        """Set the widget caption; flag settings as changed if it differs."""
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)

    def onDeleteWidget(self):
        """Release created Inputs when the widget is deleted."""
        self.clearCreatedInputs()
class TextSummarizer(OWTextableBaseWidget):
    """Textable widget for summarizing a segment in a selected language.

    Sentences of the input are scored by the (normalized) frequency of the
    words they contain; the best-scoring ones form the summary.  Two
    segmentations are emitted: the plain summary and an HTML rendering of
    the whole text in which the retained sentences are highlighted.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "TL;DR"
    description = "Summarize texts with spaCy models"
    icon = "icons/TL_DR_icon.svg"
    priority = 21

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Segmentation", Segmentation, "inputData")]
    outputs = [("Summary", Segmentation, widget.Default),
               ("HTML_Summary", Segmentation)]

    #----------------------------------------------------------------------
    # GUI layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings - defines set values when opening widget

    numSents = settings.Setting(5)
    language = settings.Setting("English")
    typeSeg = settings.Setting("Summarize each segments individually")
    percentage = settings.Setting(20)
    method = settings.Setting("Number of sentences")

    #----------------------------------------------------------------------
    # The following lines need to be copied verbatim in every Textable widget...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    #----------------------------------------------------------------------
    # Language name -> (spaCy model name, spaCy language code).  Shared by
    # languageChanged() and by the public loadModelXX() methods.

    _LANGUAGE_MODELS = {
        "English": ("en_core_web_sm", "en"),
        "French": ("fr_core_news_sm", "fr"),
        "Portuguese": ("pt_core_news_sm", "pt"),
        "Dutch": ("nl_core_news_sm", "nl"),
        "German": ("de_core_news_sm", "de"),
        "Greek": ("el_core_news_sm", "el"),
        "Italian": ("it_core_news_sm", "it"),
        "Lithuanian": ("lt_core_news_sm", "lt"),
        "Norwegian": ("nb_core_news_sm", "nb"),
        "Spanish": ("es_core_news_sm", "es"),
    }

    def __init__(self):
        """Widget creator."""

        super().__init__()

        #----------------------------------------------------------------------
        # Other (non settings) attributes...

        self.inputSeg = None
        self.outputSeg = None
        self.html_outputSeg = None
        self.nlp = None
        self.cv = None
        if INSTALLED_MODELS:
            self.model = INSTALLED_MODELS[0]
        else:
            self.model = ""

        #----------------------------------------------------------------------
        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...

        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )

        #----------------------------------------------------------------------
        # User interface...

        # NOTE(review): languageChanged() compares self.language with
        # language names ("English", "French", ...); confirm that
        # INSTALLED_MODELS holds such names.
        self.languageCombo = gui.comboBox(
            widget=self.controlArea,
            master=self,
            value="language",
            items=INSTALLED_MODELS[:],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Input's language:",
            labelWidth=135,
            callback=self.languageChanged,
            tooltip=("Please select the text's language.\n"),
        )

        box = gui.widgetBox(self.controlArea, "Language informations:")
        self.infoa = gui.widgetLabel(
            box,
            "More languages are available. \nTo access them, please use the spaCy widget to \ndownload the model first."
        )

        self.lenghtMethodCombo = gui.comboBox(
            widget=self.controlArea,
            master=self,
            value="method",
            items=[
                "Number of sentences",
                "Percentage of input's length",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Define summary's length by:",
            labelWidth=180,
            # Shows the spin box matching the chosen method, hides the other.
            callback=self.summaryGui,
            tooltip=("How do you want to choose the summary's length ?"),
        )

        # The callback must be the bound method itself: calling it here
        # would run it during construction and register its return value
        # instead of a callable.
        self.numSentsSpin = gui.spin(
            widget=self.controlArea,
            master=self,
            value='numSents',
            label='Number of sentences : ',
            callback=self.sendButton.settingsChanged,
            labelWidth=180,
            tooltip=('Select the number of sentences wanted for the summary.'),
            # The real maximum is set by maxNumSents() once input is known.
            maxv=10,
            minv=1,
            step=1,
        )

        self.percentageSpin = gui.spin(
            widget=self.controlArea,
            master=self,
            value='percentage',
            label='Length in %:',
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=
            ('Select the length of the summary in percentage of the input text.'
             ),
            maxv=99,
            minv=1,
            step=1,
        )

        self.segmentBox = gui.comboBox(
            widget=self.controlArea,
            master=self,
            value="typeSeg",
            items=[
                "Summarize all segments as one",
                "Summarize each segments individually",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Segmentation",
            labelWidth=135,
            callback=self.maxNumSents,
            tooltip=("How should the input segments be summarized ? \n"),
        )

        gui.rubber(self.controlArea)

        #----------------------------------------------------------------------
        # Draw Info box and Send button...

        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input.", "warning")

        # Check that there's a model and if not call noLanguageModelWarning()
        if not self.model:
            self.noLanguageModelWarning()

    #----------------------------------------------------------------------------

    def inputData(self, segmentation):
        """Process incoming data.

        Stores the incoming segmentation, makes sure a language model is
        loaded, refreshes the GUI elements that depend on the input and
        triggers sending if applicable.  A None input clears both outputs.
        """
        self.inputSeg = segmentation
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.send('Summary', None, self)
            self.send('HTML_Summary', None, self)
            return

        # Load the default (English) model only when nothing is loaded yet,
        # so that a model chosen through languageChanged() is not discarded
        # each time new input arrives.
        if self.nlp is None or self.cv is None:
            self.cv = self.loadModelEN()

        # Set max number of sentence of summary
        self.maxNumSents()

        # Disable percentageSpin OR numSentsSpin
        self.summaryGui()

        # Set segmentBox visible OR unvisible
        self.segmentBoxState()

        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def noLanguageModelWarning(self):
        """Warn user that a spaCy model must be installed and disable GUI."""
        self.infoBox.setText(
            "Please use the spaCy widget to download a language "
            "model first.",
            "warning",
        )
        self.controlArea.setDisabled(True)

    def maxNumSents(self):
        """Set the maximum of numSentsSpin according to inputSeg.

        The maximum is the total sentence count when all segments are
        summarized as one, and the smallest per-segment count otherwise.
        Nothing is computed while there is no input or no loaded model
        (this method is also a combo box callback and may fire early).
        """
        hasInput = self.inputSeg is not None and len(self.inputSeg) > 0
        if hasInput and self.nlp is not None:
            if self.typeSeg == "Summarize all segments as one":
                fusionStrategy = sum
            else:
                fusionStrategy = min
            sentenceCount = fusionStrategy(
                len(list(self.nlp(seg.get_content()).sents))
                for seg in self.inputSeg)
            # The spin box minimum is 1; never set a maximum below it.
            self.numSentsSpin.setMaximum(max(1, sentenceCount))
        # Notify after updating the maximum so that anything triggered by
        # the notification sees the new bound.
        self.sendButton.settingsChanged()

    def languageChanged(self):
        """Load the appropriate model according to user choice."""
        self.infoBox.setText(
            u"Loading model, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=1)
        try:
            # Unknown language values leave the current model untouched.
            if self.language in self._LANGUAGE_MODELS:
                self.cv = self._loadModelFor(self.language)
            progressBar.advance()
        finally:
            # Always give the GUI back, even if loading the model failed.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        self.sendButton.settingsChanged()

    def summaryGui(self):
        """Show the spin box matching the selected method, hide the other."""
        byCount = self.method == "Number of sentences"

        self.numSentsSpin.setVisible(byCount)
        self.numSentsSpin.label.setVisible(byCount)
        self.percentageSpin.setVisible(not byCount)
        self.percentageSpin.label.setVisible(not byCount)

        self.sendButton.settingsChanged()

    def segmentBoxState(self):
        """Hide segmentBox if input has a single segment, show it if it
        has several; leave it unchanged for an empty input."""
        segmentCount = len(self.inputSeg)
        if segmentCount == 0:
            return
        severalSegments = segmentCount > 1
        self.segmentBox.setVisible(severalSegments)
        self.segmentBox.label.setVisible(severalSegments)

    ################################################################
    # Called when send button is clicked
    ################################################################

    def sendData(self):
        """Compute result of widget processing and send to output."""

        # Check that there's a model
        if not self.model:
            self.noLanguageModelWarning()
            return

        # Check that there's an input
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.send('Summary', None, self)
            self.send('HTML_Summary', None, self)
            return

        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)

        try:
            # One text per summary to produce: either each segment on its
            # own, or all segments joined into a single text.
            if self.typeSeg == "Summarize each segments individually":
                contents = [
                    segment.get_content() for segment in self.inputSeg
                ]
            elif self.typeSeg == "Summarize all segments as one":
                contents = [" ".join(
                    [segment.get_content() for segment in self.inputSeg])]
            else:
                contents = []

            segments = list()
            html_segments = list()
            for content in contents:
                resume, html_resume = self.summarize(self.cv, content)
                segments.append(Segment(str_index=resume[0].str_index))
                html_segments.append(
                    Segment(str_index=html_resume[0].str_index))

            # Create segmentations and assign them to the outputs
            self.outputSeg = Segmentation(segments, self.captionTitle)
            self.html_outputSeg = Segmentation(html_segments,
                                               self.captionTitle)

            # Send segmentations to output channels
            self.send("Summary", self.outputSeg, self)
            self.send('HTML_Summary', self.html_outputSeg, self)

            # Set message to sent
            message = "%i segment@p sent to output " % len(self.outputSeg)
            message = pluralize(message, len(self.outputSeg))
            self.infoBox.setText(message)

            self.sendButton.resetSettingsChangedFlag()
        finally:
            # Re-enable the GUI even when summarizing raised.
            self.controlArea.setDisabled(False)

    ################################################################
    # Main function
    ################################################################

    def summarize(self, cv, content):
        """Summarize a text.

        Arguments:
            cv: vectorizer used to count words; self.cv is used when None.
            content: the text (string) to summarize.

        Returns a pair (summary, HTML summary), each wrapped with Input().
        """
        vectorizer = self.cv if cv is None else cv
        progressBar = ProgressBar(self, iterations=3)

        try:
            doc = self.nlp(content)
            sentences = list(doc.sents)

            corpus = [sent.text.lower() for sent in sentences]
            cv_fit = vectorizer.fit_transform(corpus)

            # Count unique words and how many times they appear.
            # get_feature_names() is absent from recent scikit-learn
            # releases, hence the newer accessor is tried first.
            try:
                word_list = vectorizer.get_feature_names_out()
            except AttributeError:
                word_list = vectorizer.get_feature_names()
            count_list = cv_fit.toarray().sum(axis=0)

            # Frequency of each word relative to the most frequent one
            higher_frequency = max(count_list)
            word_frequency = {
                word: count / higher_frequency
                for word, count in zip(word_list, count_list)
            }

            progressBar.advance()

            # Score of a sentence: sum of the frequencies of its known
            # words divided by its number of tokens.  Sentences without
            # any known word are not ranked at all.
            sentence_rank = {}
            for sent in sentences:
                words = [token.text.lower() for token in sent]
                known = [w for w in words if w in word_frequency]
                if known:
                    sentence_rank[sent] = (
                        sum(word_frequency[w] for w in known) / len(words))

            progressBar.advance()

            # Number of sentences to keep, depending on the chosen method
            if self.method == "Number of sentences":
                keepCount = self.numSents
            else:
                # "Percentage of input's length"; keep at least one
                # sentence so that the summary is never empty.
                keepCount = max(
                    1,
                    int(round(self.percentage * len(sentence_rank) / 100)))

            # Pick the best sentences themselves (not their scores) so
            # that ties cannot yield more sentences than requested.
            best = set(
                sorted(sentence_rank, key=sentence_rank.get,
                       reverse=True)[:keepCount])

            # Keep the selected sentences in document order
            summary = [sent for sent in sentence_rank if sent in best]

            progressBar.advance()

            # Join all sentences in a single string
            resume = " ".join(str(sent) for sent in summary)

            # HTML version: the whole text, retained sentences highlighted
            html_sentences = [
                "<b style='color:blue'>" + str(sent) + "</b>"
                if sent in best else str(sent) for sent in sentences
            ]
            html_resume = "<!DOCTYPE html>\n<html>\n<body>\n" + " ".join(
                html_sentences) + "\n</body>\n</html>"
        finally:
            progressBar.finish()

        # Create output segmentations from summaries
        return Input(resume), Input(html_resume)

    ################################################################
    # loadModelXX() methods load the chosen model
    ################################################################

    def _loadModelFor(self, languageName):
        """Load the spaCy model registered for languageName into self.nlp
        and return a CountVectorizer using that language's stop words."""
        import importlib

        modelName, langCode = self._LANGUAGE_MODELS[languageName]
        self.nlp = spacy.load(modelName)
        stopWords = importlib.import_module(
            "spacy.lang.%s.stop_words" % langCode).STOP_WORDS
        return CountVectorizer(stop_words=list(stopWords))

    def loadModelEN(self):
        """Load the English model and return its vectorizer."""
        return self._loadModelFor("English")

    def loadModelFR(self):
        """Load the French model and return its vectorizer."""
        return self._loadModelFor("French")

    def loadModelPT(self):
        """Load the Portuguese model and return its vectorizer."""
        return self._loadModelFor("Portuguese")

    def loadModelNL(self):
        """Load the Dutch model and return its vectorizer."""
        return self._loadModelFor("Dutch")

    def loadModelDE(self):
        """Load the German model and return its vectorizer."""
        return self._loadModelFor("German")

    def loadModelEL(self):
        """Load the Greek model and return its vectorizer."""
        return self._loadModelFor("Greek")

    def loadModelIT(self):
        """Load the Italian model and return its vectorizer."""
        return self._loadModelFor("Italian")

    def loadModelLT(self):
        """Load the Lithuanian model and return its vectorizer."""
        return self._loadModelFor("Lithuanian")

    def loadModelNB(self):
        """Load the Norwegian model and return its vectorizer."""
        return self._loadModelFor("Norwegian")

    def loadModelES(self):
        """Load the Spanish model and return its vectorizer."""
        return self._loadModelFor("Spanish")

    #--------------------------------------------------------------
    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)