def __init__(self, train_docs=None, train_labels=None, MAX_ITERATIONS=100, dev_docs=None, dev_labels=None, weights=None, biases=None):
    """Set up the perceptron classifier.

    If both ``weights`` and ``biases`` filenames are supplied, a previously
    trained model is unpickled from the data directory. Otherwise the model
    is initialized empty and trained on ``train_docs``/``train_labels``.

    :param train_docs: training feature vectors (used only when training)
    :param train_labels: gold labels aligned with train_docs
    :param MAX_ITERATIONS: cap on perceptron training passes
    :param dev_docs: unused here (kept for interface compatibility)
    :param dev_labels: unused here (kept for interface compatibility)
    :param weights: filename of pickled weights, relative to the data dir
    :param biases: filename of pickled biases, relative to the data dir
    """
    # Label set: the catch-all 'too_long' class plus every syllable
    # sequence of length 1..5 generated by add_syll.
    syllables = ['']
    self.CLASSES = ['too_long']
    for _ in range(1, 6):
        syllables = add_syll(syllables)
        self.CLASSES += syllables
    self.MAX_ITERATIONS = MAX_ITERATIONS
    if weights is not None and biases is not None:
        # Restore a previously trained model from disk.
        with open(util.path_to_data_directory() + weights, 'rb') as wfile:
            self.weights = pickle.load(wfile)
        with open(util.path_to_data_directory() + biases, 'rb') as bfile:
            self.biases = pickle.load(bfile)
    else:
        # Fresh model: one zero weight vector and zero bias per class,
        # then train from scratch.
        self.weights = {label: Counter() for label in self.CLASSES}
        self.biases = {label: 0 for label in self.CLASSES}
        self.learn(train_docs, train_labels)
def clean_abbreviations():
    """
    Loads the abbreviations from disk

    Selects the first word from the potential list of matches:
        "Temp. -> Temporal, Temporary" is converted to "Temp. -> Temporal"

    Gets rid of any optional parts of the abbreviation:
        "Improp. -> Improper(ly)" is converted to "Improp. -> Improper"

    :return: dictionary of uppercased `abbreviation`:`meaning` pairs
    """
    abbreviations = load_abbreviations()
    # Matches a parenthesized word, e.g. the "(ly)" in "Improper(ly)".
    paren_pattern = re.compile("[(]\w+[)]", flags=re.UNICODE)
    for key in abbreviations:
        expansion = abbreviations[key]
        # A comma means several candidate expansions; keep only the first.
        if "," in expansion:
            expansion = expansion.split(",")[0].strip()
        # Strip any optional parenthesized suffix.
        abbreviations[key] = paren_pattern.sub(r'', expansion)

    # Normalize everything to uppercase.
    uppercase_abbreviations = {
        key.upper(): value.upper() for key, value in abbreviations.items()
    }

    # Cache the cleaned mapping to disk so this work isn't repeated.
    with open(util.path_to_data_directory() + "abbreviations.json", "w") as f:
        json.dump(uppercase_abbreviations, f)
    return uppercase_abbreviations
def get_abbreviations():
    """
    Calls to the Oxford English Dictionary abbreviations page and parses out the abbreviations
    into a dictionary of `abbreviation`:`meaning` pairs. Also writes the dictionary of abbreviations
    to a local json file called abbreviations.json in the data folder

    :return: The dictionary of abbreviations
    """
    # We'll use this to store the eventual data
    abbreviations = {}

    # We first get the contents of the page and turn it into a beautiful soup object
    abbreviations_url = "http://public.oed.com/how-to-use-the-oed/abbreviations/"
    req = requests.get(abbreviations_url)
    # FIX: name the parser explicitly. Without it, bs4 emits a
    # GuessedAtParserWarning and the result depends on whichever parser
    # happens to be installed on the machine.
    soup = bs4.BeautifulSoup(req.text, "html.parser")

    # Now we iterate through all of the <tr> tags (table rows) since that's where the abbreviations are
    trs = soup.find_all("tr")
    for tr in trs:
        # Rows with a single cell are section headers; real entries have
        # exactly two cells: abbreviation and meaning.
        tds = tr.find_all("td")
        if len(tds) == 2:
            abbreviations[tds[0].text] = tds[1].text

    # We save the abbreviations to a file and then return them as well in case the caller wants them right away
    with open(util.path_to_data_directory() + "abbreviations.json", "w") as f:
        json.dump(abbreviations, f)
    return abbreviations
def get_cmudict():
    """
    Fetches the CMU pronunciations dictionary from their servers, caches the
    raw text to cmudict-07.b in the data directory, and returns the text
    (split by line)

    :return: list of lines in cmudict
    """
    req = requests.get("http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b")
    # BUG FIX: the original opened the cache file for writing but never
    # wrote to it, leaving a truncated/empty file on disk.
    with open(util.path_to_data_directory() + "cmudict-07.b", "w") as f:
        f.write(req.text)
    return req.text.split('\n')
def load_pronunciations():
    """
    Loads the pronunciations dictionary from pronunciations.json and returns it

    :return: The pronunciations dictionary
    """
    pronunciations_path = util.path_to_data_directory() + "pronunciations.json"
    with open(pronunciations_path, "r") as prounciations_file:
        return json.load(prounciations_file)
def load_abbreviations():
    """
    Loads the abbreviations file and returns the dictionary

    :return: Abbreviations dictionary of `abbreviation`:`meaning` pairs
    """
    path = util.path_to_data_directory() + "abbreviations.json"
    # Guard clause: fail loudly with a hint if the cache hasn't been built.
    if not os.path.isfile(path):
        raise FileNotFoundError(
            path + " doesn't exist. Try get_abbreviations() first.")
    with open(path, "r") as f:
        return json.load(f)
def learn(self, train_docs, train_labels):
    """Train on the provided data with the perceptron algorithm.

    Runs up to self.MAX_ITERATIONS passes over the training data, stopping
    early once a full pass makes no updates. At the end of training,
    self.weights and self.biases contain the final model parameters, which
    are also pickled to disk.

    :param train_docs: feature Counters, one per training example
    :param train_labels: gold labels aligned with train_docs
    """
    for i in range(self.MAX_ITERATIONS):
        updates = 0
        for doc, gold in zip(train_docs, train_labels):
            pred = self.predict(doc)
            if pred != gold:
                # BUG FIX: Counter's in-place += and -= discard zero and
                # negative counts, so weights could never go negative and
                # the model was silently corrupted after each update.
                # update()/subtract() preserve the full signed values.
                self.weights[gold].update(doc)
                self.weights[pred].subtract(doc)
                self.biases[gold] += 1
                # BUG FIX: the standard multiclass perceptron update also
                # lowers the bias of the wrongly predicted class; the
                # original only raised the gold class's bias.
                self.biases[pred] -= 1
                updates += 1
        trainAcc = self.test_eval(train_docs, train_labels)
        #devAcc = self.test_eval(dev_docs, dev_labels)
        print('iteration:', i, 'updates:', updates, 'trainAcc:', trainAcc, file=sys.stderr)
        if updates == 0:
            # Converged: a whole pass with no mistakes.
            break
    # Persist the trained parameters so future runs can skip training.
    with open(util.path_to_data_directory() + 'weights_5.pk', 'wb') as wfile:
        pickle.dump(self.weights, wfile, 3)
    with open(util.path_to_data_directory() + 'biases_5.pk', 'wb') as bfile:
        pickle.dump(self.biases, bfile, 3)
    return
    # Tail of a pronunciation-guessing function whose header is above this
    # view; builds the phone list right-to-left, then prepends a fake
    # stressed vowel placeholder per remaining stress marker.
    guess = [phon] + guess
    if len(stress) > 0:
        for num in stress[::-1]:
            phon = 'UU' + str(num) # fake vowel
            guess = [phon] + guess
    return guess


if __name__ == "__main__":
    '''for testing'''
    # Collect out-of-vocabulary tokens (not in cmudict) from the sample
    # corpus, count their frequencies, then print a pronunciation guess
    # for each, most frequent first.
    CMUDICT = resources.cmudict()
    OOV = defaultdict(int)
    with open(util.path_to_data_directory() + "pokemon-comments.txt") as infile:
        filestring = infile.read()
    tokens = nltk.word_tokenize(filestring)
    for tok in tokens:
        tok = tok.upper()
        if tok not in CMUDICT:
            OOV[tok] += 1
    # NOTE: sorted() over the dict yields a list of keys ordered by
    # descending count; the counts themselves are discarded here.
    OOV = sorted(OOV, key=OOV.get, reverse=True)
    for tok in OOV:
        if tok.isalnum():
            print(tok, str(guess_pron(tok, CMUDICT=CMUDICT)))