示例#1
0
 def __init__(self,
              train_docs=None,
              train_labels=None,
              MAX_ITERATIONS=100,
              dev_docs=None,
              dev_labels=None,
              weights=None,
              biases=None):
     """
     Build the label inventory, then either load a saved model or train one.

     The label inventory (self.CLASSES) is 'too_long' followed by the
     results of five chained add_syll() calls, starting from [''].

     :param train_docs: training documents, passed to self.learn() when no
         saved model is given
     :param train_labels: training labels, passed to self.learn() likewise
     :param MAX_ITERATIONS: stored as self.MAX_ITERATIONS
     :param dev_docs: accepted but not used here
     :param dev_labels: accepted but not used here
     :param weights: file name (relative to the data directory) of a
         pickled weights object, or None
     :param biases: file name (relative to the data directory) of a
         pickled biases object, or None
     """
     # Each round feeds the previous round's output back into add_syll().
     labels = ['too_long']
     current = ['']
     for _ in range(5):
         current = add_syll(current)
         labels += current
     self.CLASSES = labels
     self.MAX_ITERATIONS = MAX_ITERATIONS

     # Without BOTH file names there is no saved model: start from zeroed
     # parameters and train.
     if weights is None or biases is None:
         self.weights = {label: Counter() for label in self.CLASSES}
         self.biases = dict.fromkeys(self.CLASSES, 0)
         self.learn(train_docs, train_labels)
         return

     # NOTE(review): pickle.load executes arbitrary code from the file;
     # only point these at trusted model files.
     with open(util.path_to_data_directory() + weights, 'rb') as weights_file:
         self.weights = pickle.load(weights_file)
     with open(util.path_to_data_directory() + biases, 'rb') as biases_file:
         self.biases = pickle.load(biases_file)
示例#2
0
def clean_abbreviations():
    """
    Normalises the abbreviations stored on disk and writes them back.

    Loads the abbreviations with load_abbreviations(), then for each meaning:
    Selects the first word from the potential list of matches:
        "Temp. -> Temporal, Temporary" is converted to "Temp. -> Temporal"
    Gets rid of any optional parts of the abbreviation:
        "Improp. -> Improper(ly)" is converted to "Improp. -> Improper"
    Finally upper-cases both the abbreviation and its meaning.

    The cleaned dictionary overwrites abbreviations.json in the data folder.
    :return: dictionary of upper-cased `abbreviation`:`meaning` pairs
    """
    abbreviations = load_abbreviations()
    # Raw string: "\w" inside a plain literal is an invalid escape sequence
    # (a SyntaxWarning on recent Python versions). The compiled pattern is
    # unchanged.
    paren_pattern = re.compile(r"[(]\w+[)]", flags=re.UNICODE)
    uppercase_abbreviations = {}
    for abbreviation, meaning in abbreviations.items():
        # If things are a list of potential matches, take the first one
        if "," in meaning:
            meaning = meaning.split(",")[0].strip()
        # Remove everything in parentheses
        meaning = paren_pattern.sub('', meaning)
        # Convert everything to uppercase
        uppercase_abbreviations[abbreviation.upper()] = meaning.upper()
    # Write the new thing to disk so that we don't have to do this again
    with open(util.path_to_data_directory() + "abbreviations.json", "w") as f:
        json.dump(uppercase_abbreviations, f)
    return uppercase_abbreviations
示例#3
0
def get_abbreviations():
    """
    Calls to the Oxford English Dictionary abbreviations page and parses out
    the abbreviations into a dictionary of `abbreviation`:`meaning` pairs.

    Also writes the dictionary of abbreviations to a local json file called
    abbreviations.json in the data folder.
    :return: The dictionary of abbreviations
    :raises requests.HTTPError: if the page request returns an error status
    """
    abbreviations_url = "http://public.oed.com/how-to-use-the-oed/abbreviations/"
    req = requests.get(abbreviations_url)
    # Stop on an HTTP error instead of parsing an error page and overwriting
    # abbreviations.json with an empty or bogus dictionary.
    req.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed (and to avoid bs4's warning).
    soup = bs4.BeautifulSoup(req.text, "html.parser")
    # The abbreviations live in table rows (<tr>)
    abbreviations = {}
    for tr in soup.find_all("tr"):
        tds = tr.find_all("td")
        # Only rows with exactly two cells are abbreviation/meaning pairs;
        # rows with a single cell act as headers and are skipped
        if len(tds) == 2:
            abbreviations[tds[0].text] = tds[1].text
    # We save the abbreviations to a file and then return them as well in
    # case the caller wants them right away
    with open(util.path_to_data_directory() + "abbreviations.json", "w") as f:
        json.dump(abbreviations, f)
    return abbreviations
示例#4
0
def get_cmudict():
    """
    Fetches the CMU pronunciations dictionary from their servers, saves a
    copy to cmudict-07.b in the data folder and returns the text (split by line)
    :return: list of lines in cmudict
    :raises requests.HTTPError: if the download returns an error status
    """
    req = requests.get("http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b")
    # Stop on an HTTP error rather than caching an error page as the dictionary
    req.raise_for_status()
    text = req.text
    # Previously the file was opened for writing but never written to, which
    # left an empty file on disk; actually store the downloaded text.
    # NOTE(review): "cmudict-07.b" looks like a typo for "cmudict-0.7b"; the
    # name is kept as-is because other code may read this exact path.
    with open(util.path_to_data_directory() + "cmudict-07.b", "w") as f:
        f.write(text)
    return text.split('\n')
示例#5
0
def load_pronunciations():
    """
    Reads pronunciations.json from the data directory and returns its
    parsed contents.
    :return: The pronunciations dictionary
    """
    json_path = util.path_to_data_directory() + "pronunciations.json"
    with open(json_path, "r") as json_file:
        return json.load(json_file)
示例#6
0
def load_abbreviations():
    """
    Reads abbreviations.json from the data directory and returns its parsed
    contents.
    :return: Abbreviations dictionary of `abbreviation`:`meaning` pairs
    :raises FileNotFoundError: if abbreviations.json is not an existing file
    """
    json_path = util.path_to_data_directory() + "abbreviations.json"
    # Bail out early with a hint when the file has not been created yet
    if not os.path.isfile(json_path):
        raise FileNotFoundError(
            util.path_to_data_directory() +
            "abbreviations.json doesn't exist. Try get_abbreviations() first.")
    with open(json_path, "r") as json_file:
        return json.load(json_file)
示例#7
0
 def learn(self, train_docs, train_labels):
     """
     Train on the provided data with the perceptron algorithm.

     Runs up to self.MAX_ITERATIONS passes over the training data and stops
     early after a pass that makes no updates. After every pass the number
     of updates and the training accuracy (self.test_eval) are printed to
     stderr.

     At the end of training, self.weights and self.biases contain the final
     model parameters; both are also pickled to weights_5.pk and biases_5.pk
     in the data directory.

     :param train_docs: sequence of documents; each one is handed to
         self.predict() and added to / subtracted from a Counter of weights,
         so it is presumably a feature -> count mapping (TODO confirm)
     :param train_labels: gold labels, aligned with train_docs by index;
         each must be a key of self.weights and self.biases
     """
     for i in range(self.MAX_ITERATIONS):
         updates = 0
         for t, doc in enumerate(train_docs):
             pred = self.predict(doc)
             gold = train_labels[t]
             if pred != gold:
                 # Use update()/subtract() rather than "+=" / "-=": the
                 # in-place Counter operators discard every entry whose
                 # count is zero or negative, which would make it
                 # impossible for the model to learn a negative weight.
                 self.weights[gold].update(doc)
                 self.weights[pred].subtract(doc)
                 # Move the biases symmetrically: promote the gold class
                 # and demote the wrongly predicted one.
                 self.biases[gold] += 1
                 self.biases[pred] -= 1
                 updates += 1
         trainAcc = self.test_eval(train_docs, train_labels)
         print('iteration:',
               i,
               'updates:',
               updates,
               'trainAcc:',
               trainAcc,
               file=sys.stderr)
         # A full pass without mistakes means training has converged
         if updates == 0:
             break
     # Persist the trained parameters (pickle protocol 3)
     with open(util.path_to_data_directory() + 'weights_5.pk',
               'wb') as wfile:
         pickle.dump(self.weights, wfile, 3)
     with open(util.path_to_data_directory() + 'biases_5.pk',
               'wb') as bfile:
         pickle.dump(self.biases, bfile, 3)
示例#8
0
            guess = [phon] + guess

    if len(stress) > 0:
        for num in stress[::-1]:
            phon = 'UU' + str(num)  # fake vowel
            guess = [phon] + guess

    return guess


if __name__ == "__main__":
    # For testing: count the tokens of pokemon-comments.txt that are missing
    # from CMUDICT, then print a guessed pronunciation for each alphanumeric
    # one, most frequent first.
    CMUDICT = resources.cmudict()
    OOV = defaultdict(int)

    corpus_path = util.path_to_data_directory() + "pokemon-comments.txt"
    with open(corpus_path) as infile:
        filestring = infile.read()

    # Tally every upper-cased token that CMUDICT does not contain
    for token in nltk.word_tokenize(filestring):
        upper_token = token.upper()
        if upper_token in CMUDICT:
            continue
        OOV[upper_token] += 1

    # From here on OOV is a list of tokens ordered by descending count
    OOV = sorted(OOV, key=OOV.get, reverse=True)

    for tok in (word for word in OOV if word.isalnum()):
        print(tok, str(guess_pron(tok, CMUDICT=CMUDICT)))