Example no. 1
import random

import nltk


# gen_input, gen_dicts and MARKOV_ORDER are assumed to be defined elsewhere
# in the same module.
def gen_mnemonic(corpus_path, input_string):
    first_letters = gen_input(input_string)
    # Read and lowercase the whole corpus, then split it on whitespace.
    with open(corpus_path, encoding='utf8') as corpus_file:
        corpus = corpus_file.read().lower()

    tokens = nltk.WhitespaceTokenizer().tokenize(corpus)

    tagged = nltk.pos_tag([i for i in tokens if i], tagset='universal')
    markov_dict, tag_dict = gen_dicts(tagged)

    # initialize sequence
    init_wordpool = []
    for pair in markov_dict:
        if pair[0][0] == first_letters[0]:
            init_wordpool.append(pair)

    first_word = random.choice(init_wordpool)

    mnemonic = [first_word]
    # For each remaining letter, prefer a word that both follows the previous
    # word in the Markov chain and starts with the required letter.
    for i in range(1, len(first_letters)):
        choices = []
        # Sort the continuations recorded for the previous word by their values.
        markov_chain = markov_dict.get(mnemonic[i - 1])
        markov_chain = sorted(markov_chain, key=markov_chain.get)

        # Sort the tags recorded after the previous word's tag by their values.
        tag_chain = tag_dict.get(mnemonic[i - 1][1])
        tag_chain = sorted(tag_chain, key=tag_chain.get)

        # Prefer continuations of the previous word that start with the
        # required letter.
        for pair in markov_chain:
            if pair[0][0] == first_letters[i]:
                choices.append(pair)

        # Also consider any observed pair that starts with the required letter
        # and whose tag is among the first MARKOV_ORDER - 1 entries of tag_chain.
        for pair in markov_dict:
            for k in range(min(MARKOV_ORDER - 1, len(tag_chain))):
                if pair[0][0] == first_letters[i] and pair[1] == tag_chain[k]:
                    choices.append(pair)

        if choices:
            mnemonic.append(choices[0])
        else:
            # No candidate matched: fall back to any pair that starts with the
            # required letter and pick one at random.
            fallback = [pair for pair in markov_dict
                        if pair[0][0] == first_letters[i]]
            mnemonic.append(random.choice(fallback))

    out = [i[0] for i in mnemonic]
    return " ".join(out)
Example no. 2
import nltk
from nltk.stem import SnowballStemmer


def snowBallStemmer(text):
    # Split on whitespace and stem each token with the English Snowball stemmer.
    tokens = nltk.WhitespaceTokenizer().tokenize(text)
    stems = []
    stemmer = SnowballStemmer("english")
    for token in tokens:
        token = stemmer.stem(token)
        if token != "":
            stems.append(token)
    return stems
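
A quick usage sketch; it only assumes nltk is installed:

print(snowBallStemmer("the cats were running quickly"))
# e.g. ['the', 'cat', 'were', 'run', 'quick']
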
Example no. 3
    def evaulate(taggedfile, reffile):

        tagged_content = open(taggedfile)
        reffile_content = open(reffile)

        totaltokens = 0
        totalKnowns = 0
        totalUnknowns = 0
        unknownCorrect = 0
        knownCorrect = 0

        for taggedline, refline in zip(tagged_content, reffile_content):
            taggedtoken = nltk.WhitespaceTokenizer().tokenize(taggedline)
            reftoken = nltk.WhitespaceTokenizer().tokenize(refline)

            totaltokens += len(taggedtoken)  # get the total number of tokens

            for index, token in enumerate(taggedtoken):
                taggedtag = token.split("/")[1]
                # if unknown tag
>" in [">
                if ">>" in taggedtag:
                    taggedtag = taggedtag.rstrip(">>")
                    totalUnknowns += 1
                    if taggedtag == reftoken[index].split("/")[1]:
                        unknownCorrect += 1
                else:
                    totalKnowns += 1
                    if taggedtag == reftoken[index].split("/")[1]:
                        knownCorrect += 1

        print "\n----------Results----------"
        print "Overall Accuracy: " + str(
            (knownCorrect + unknownCorrect) / float(totaltokens))
        print "Known Accuracy: " + str(knownCorrect / float(totalKnowns))
        print "Unknown Accuracy: " + str(unknownCorrect / float(totalUnknowns))
        print "\n"
Example no. 4
    def tokenization(fpath):

        pos = {}
        no_of_tags = 0
        word_tag = {}
        transition = {}

        starttags = ["<s>/<s>"]  # Dummy start symbol
        endtags = ["<e>/<e>"]  # Dummy end symbol

        file_content = open(fpath)

        for line in file_content.readlines():
            tokens = starttags + nltk.WhitespaceTokenizer().tokenize(
                line) + endtags

            for index, token in enumerate(tokens):  # Create the dictionary

                # Increment the No_of_tags by 1
                no_of_tags += 1

                # Add the <word tag: count> to dictionary
                word = token.split("/")[0]
                tag = token.split("/")[1]
                if word + " " + tag in word_tag:
                    word_tag[word + " " + tag] += 1
                else:
                    word_tag[word + " " + tag] = 1

                # Add the pos occurrence to dictionary
                if tag in pos:
                    pos[tag] += 1
                else:
                    pos[tag] = 1

                # Get the transition tags
                if index < len(tokens) - 1:
                    tag1 = tokens[index].split("/")[1]
                    tag2 = tokens[index + 1].split("/")[1]
                    if (tag1 + " " + tag2) in transition:
                        transition[tag1 + " " + tag2] += 1
                    else:
                        transition[tag1 + " " + tag2] = 1
        file_content.close()

        # tags dictionary, transition dictionary, word_tag dictionary, no of tags in the file
        token_results = [pos, transition, word_tag, no_of_tags]

        return token_results
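
A usage sketch with a placeholder training file of word/TAG tokens, one
sentence per line (like evaulate above, the method is shown here as if it were
a plain function):

pos, transition, word_tag, no_of_tags = tokenization("train.pos")
print(no_of_tags)                    # total tag count, including <s>/<e> dummies
print(transition.get("<s> DT", 0))   # sentences starting with a DT-tagged word,
                                     # assuming PTB-style tags in the file
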
Example no. 5
import re

import nltk
import nltk as nl  # the functions in this example refer to nltk under both names
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


def tokenize(text):
    """
    Tokenizes sequences of text and stems the tokens.
    :param text: String to tokenize
    :return: List with stemmed tokens
    """
    tokens = nl.WhitespaceTokenizer().tokenize(text)
    # Keep only letters (and apostrophes on the first pass), drop English
    # stop words, and deduplicate via set().
    tokens = list(set(re.sub("[^a-zA-Z']", "", token) for token in tokens))
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
    stems = []
    stemmer = SnowballStemmer("english")
    for token in tokens:
        token = stemmer.stem(token)
        if token != "":
            stems.append(token)
    return stems


def pre_process(comment_text):
    # lemmatizer and stop_words are module-level globals in the original
    # project: a lemmatizer instance and a collection of stop words.
    # Undo a few PTB-style tokenization artifacts before cleaning.
    comment_text = re.sub(" n't", "n't", comment_text)
    comment_text = re.sub(" 's", "", comment_text)
    comment_text = re.sub(" 'd", "'d", comment_text)
    comment_text = re.sub(" 're", "'re", comment_text)
    comment_text = re.sub("-LRB-", "(", comment_text)
    comment_text = re.sub("-RRB-", ")", comment_text)
    comment_text = re.sub(r'\W', ' ', comment_text)
    comment_text = re.sub(r'\s+', ' ', comment_text)
    comment_text = re.sub(r'[0-9]+', '', comment_text)
    tokenizer = nltk.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(comment_text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # POS-tag the lemmatized tokens so proper nouns (NNP/NNPS) can be dropped.
    tagged = nltk.tag.pos_tag(tokens)
    no_stopwords = []
    for token, tag in tagged:
        token = token.lower()
        if tag != 'NNP' and tag != 'NNPS' and token not in stop_words and len(token) > 2:
            no_stopwords.append(token)

    processed = ' '.join(no_stopwords)
    processed = processed.strip(' ')
    return processed
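
A minimal sketch of the module-level setup the two helpers above rely on, plus
example calls; the lemmatizer and stop-word choices are assumptions rather than
the original project's exact configuration, and the relevant NLTK resources
(stopwords, wordnet, the POS tagger) must be downloaded:

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Assumed globals used by pre_process; the originals are not shown above.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

print(tokenize("Cats are running faster than dogs"))
# e.g. ['cat', 'run', 'faster', 'dog'], in arbitrary order (set-based dedup)

print(pre_process("The kids -LRB- all 3 of them -RRB- were n't running in 2020 !"))
# e.g. "kid running" after undoing PTB escapes and dropping punctuation,
# digits, stop words, proper nouns and very short tokens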