def create_index_old(files):
    """
    Build an inverted index from word to the set of document IDs that
    contain it.

    A document ID is the 0-based position of a file in ``files``, so
    ``files[doc_id]`` recovers the file name.

    Parameters:
        files: list of fully-qualified filenames to index.

    Returns:
        dict mapping each word to a set (not a list) of doc IDs.
    """
    dictionary = {}
    for doc_id, file in enumerate(files):
        # Same per-document pipeline as before: text -> words -> unique words.
        for word in toUnique(words(get_text(file))):
            # Doc IDs are indexes into `files` starting at 0 (not i + 1),
            # as the contract above requires.
            dictionary.setdefault(word, set()).add(doc_id)
    return dictionary
Exemplo n.º 2
0
def dotest(terms, expected, which):
    """
    Run one search implementation over the files found under rootdir and
    assert that filenames() of the matches equals `expected`.

    which == 0 uses linear_search, which == 1 uses create_index with
    index_search, and any other value uses the myhtable_* variants.
    Both lists are sorted in place before comparing, so the caller's
    `expected` list is reordered as a side effect.
    """
    files = filelist(rootdir)
    terms = words(terms)

    # Only the way the matching documents are obtained differs per mode.
    if which == 0:
        docs = linear_search(files, terms)
    elif which == 1:
        index = create_index(files)
        docs = index_search(files, index, terms)
    else:
        index = myhtable_create_index(files)
        docs = myhtable_index_search(files, index, terms)

    names = filenames(docs)
    names.sort()
    expected.sort()
    assert names == expected, "found "+str(names)+" != expected "+str(expected)
Exemplo n.º 3
0
def create_index(files):
    """
    Build an index from word to the set of document IDs containing it.

    A document ID is the 0-based index of a file in ``files``.

    Parameters:
        files: list of fully-qualified filenames.

    Returns:
        dict mapping each word to a set of doc IDs.
    """
    index = {}
    # enumerate() yields the doc ID directly; a filename -> ID lookup table
    # would give every duplicate path the ID of its last occurrence.
    for doc_id, file in enumerate(files):
        for term in words(get_text(file)):
            index.setdefault(term, set()).add(doc_id)
    return index
Exemplo n.º 4
0
	def getCategories (self, filename):
		"""Read the category list and load the first listed category file.

		`filename` is used as a path prefix: the list of categories is read
		from filename + 'categories.txt' (one category file name per line) and
		a category file is read from filename + <category file name>.

		Returns a dict with the category's lines stored under the file name up
		to its first '.', and the 'stopwords' entry replaced by
		self.words.listToDict(...). Returns None when the list is empty.

		NOTE(review): the return statement sits inside the loop, so only the
		first listed category is processed, and the 'stopwords' lookup raises
		KeyError unless that category's key is 'stopwords'. This looks
		unintended but is kept as-is -- confirm the intended result before
		changing it.
		"""
		# Context managers close both files; previously they were left open.
		with open(filename+'categories.txt','r') as categoriesFile:
			categories = list(categoriesFile)

		for category in categories:
			# Drop the trailing newline kept by line iteration.
			if '\n' in category:
				category = category[:len(category)-1]
			cFileList = []
			with open (filename+category, 'r') as cFile:
				for line in cFile:
					# Strip the line terminator, then one more character when a
					# carriage return is present.
					if '\n' in line:
						line = line[:len(line)-1]
					if '\r' in line:
						line = line[:len(line)-1]
					cFileList.append (line)

			if category == 'stopwords.txt':
				self.words = words (cFileList)

			# From here on `categories` is the result dict; the loop keeps
			# iterating the original list object.
			categories = {}
			categories[category.split('.')[0]] = cFileList
			categories['stopwords'] = self.words.listToDict(categories['stopwords'])

			return categories
Exemplo n.º 5
0
def dotest(terms, expected, which):
    """
    Check one search implementation against the expected file names.

    The files come from filelist(rootdir) and the query from words(terms).
    `which` selects the implementation: 0 -> linear_search,
    1 -> create_index + index_search, anything else -> the myhtable_*
    pair. The found names and `expected` are both sorted in place, so the
    caller's `expected` list is reordered.
    """
    files = filelist(rootdir)
    terms = words(terms)

    if which == 0:
        found_docs = linear_search(files, terms)
    elif which == 1:
        index = create_index(files)
        found_docs = index_search(files, index, terms)
    else:
        index = myhtable_create_index(files)
        found_docs = myhtable_index_search(files, index, terms)

    # The comparison is order-insensitive: both sides are sorted first.
    names = filenames(found_docs)
    names.sort()
    expected.sort()
    assert names == expected, "found "+str(names)+" != expected "+str(expected)
Exemplo n.º 6
0
 def test_word_occurance8(self):
     # Two words separated by a newline are each expected with a count of 1.
     expected = {'hello': 1, 'world': 1}
     counted = words('hello\nworld')
     self.assertDictEqual(expected, counted,
                          msg='should not count multilines')
Exemplo n.º 7
0
 def test_word_occurance9(self):
     # Two words separated by a tab are each expected with a count of 1.
     expected = {'hello': 1, 'world': 1}
     counted = words('hello\tworld')
     self.assertDictEqual(expected, counted, msg='should not count tabs')
Exemplo n.º 8
0
 def test_word_occurance0(self):
     # A run of two spaces must not produce an extra entry in the counts.
     expected = {'hello': 1, 'world': 1}
     counted = words('hello  world')
     self.assertDictEqual(expected, counted,
                          msg='should count multiple spaces as one')
Exemplo n.º 9
0
def create_index(files):
    """
    Build an index from word to the set of document IDs containing it.

    A document ID is the 0-based position of a file in ``files``. For each
    word w found by words(get_text(file)) in file i, i is added to the set
    stored under w.

    Returns a defaultdict created without a default factory, so looking up
    a missing word still raises KeyError like a plain dict.
    """
    word_to_docs = defaultdict()
    for doc_id, path in enumerate(files):
        for term in words(get_text(path)):
            try:
                # Word already indexed: extend its set of doc IDs.
                word_to_docs[term].add(doc_id)
            except KeyError:
                # First sighting of this word: start a new set.
                word_to_docs[term] = {doc_id}
    return word_to_docs
Exemplo n.º 10
0
def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes.

    This does the same thing as create_index() except that it stores the
    mapping in an htable with 4011 buckets.

    Parameters:
        files: list of file names; a document index is the 0-based position
            of a file in this list.

    Returns:
        the htable built by htable(NBUCKETS), filled via htable_put.
    """
    NBUCKETS = 4011
    table = htable(NBUCKETS)
    for idx, fname in enumerate(files):
        for word in words(get_text(fname)):
            doc_ids = htable_get(table, word)
            # htable_get is treated as returning None for a word that has
            # not been stored yet.
            if doc_ids is None:
                htable_put(table, word, {idx})
            else:
                # The stored set is mutated in place; no second put needed.
                doc_ids.add(idx)
    return table
Exemplo n.º 11
0
 def test_word_occurance2(self):
     # Three distinct words: each is expected with a count of 1.
     expected = {'one': 1, 'of': 1, 'each': 1}
     counted = words("one of each")
     self.assertDictEqual(expected, counted, msg='should count one of each')
Exemplo n.º 12
0
 def test_word_occurance5(self):
     # The numeric tokens are expected as the int keys 1 and 2, not as the
     # strings '1' and '2'.
     expected = {'testing': 2, 1: 1, 2: 1}
     counted = words('testing 1 2 testing')
     self.assertDictEqual(expected, counted, msg='should include numbers')
Exemplo n.º 13
0
 def test_word_occurance6(self):
     # Differently-cased spellings are expected as separate keys.
     expected = {'go': 1, 'Go': 1, 'GO': 1}
     counted = words('go Go GO')
     self.assertDictEqual(expected, counted, msg='should respect case')
Exemplo n.º 14
0
        def test_01(self):
            """Empty string."""

            test_string = ""

            # Collect into self.results, the same list the assertion below
            # inspects (presumably created in setUp -- confirm).
            for word in words(test_string):
                self.results.append(word)

            # assertEqual replaces the deprecated assertEquals alias.
            # NOTE(review): self.__doc__ is looked up on the test instance,
            # not on this method; presumably the method docstring was meant
            # -- confirm.
            self.assertEqual(len(self.results), 0, "Falla: " + self.__doc__)
Exemplo n.º 15
0
def linear_search(files, terms):
    """
    Return the files whose contents contain every search term.

    Parameters:
        files: list of file names; each is read with get_text() and split
            with words().
        terms: iterable of search terms.

    Returns:
        list of the entries of `files` that contain all terms, in the
        order they appear in `files`.
    """
    searchTerms = set(terms)
    returnFiles = []
    for item in files:
        fileWords = set(words(get_text(item)))
        # `<=` is the subset test. `<` would be a *proper* subset and would
        # wrongly skip a file whose word set equals the search terms.
        if searchTerms <= fileWords:
            returnFiles.append(item)
    return returnFiles
Exemplo n.º 16
0
 def test_word_occurance7(self):
     # Non-ASCII tokens, including their attached punctuation, are expected
     # as keys exactly as written in the input.
     expected = {"¡Hola!": 1, "¿Qué": 1, "tal?": 1, "Привет!": 1}
     counted = words('¡Hola! ¿Qué tal? Привет!')
     self.assertDictEqual(
         expected,
         counted,
         msg='should count international characters properly')
Exemplo n.º 17
0
def myhtable_create_index(files):
    """
    Fill a 4011-bucket htable with one htable_put per distinct word per file.

    Each distinct word of a file is put with that file's 0-based position
    in `files` as the value.

    NOTE(review): the value passed is a bare index, not a set; whether
    earlier indexes for the same word survive depends on how htable_put
    handles an existing key -- confirm against its implementation.
    """
    wordBook = htable(4011)
    for fileIndex, item in enumerate(files):
        # set() collapses repeated words so each is put once per file.
        for word in set(words(get_text(item))):
            htable_put(wordBook, word, fileIndex)
    return wordBook
Exemplo n.º 18
0
def test_linear_berlitz():
    """linear_search for "hawaii travel" must yield only HistoryHawaii.txt."""
    files = filelist(rootdir)
    query = words("hawaii travel")

    found = filenames(linear_search(files, query))

    assert found == ['HistoryHawaii.txt']
Exemplo n.º 19
0
def savedoc(suborgid,orgid,sourceurl,documentdate,name,dochash,pdftext,tokens,orphaned):
    """
    Store a document, its text and its longer tokens; return the new doc id.

    The document is added through documents().add() with today's date
    (YYYY-MM-DD) as the scrape date, the text through documenttexts().add(),
    and every (token, frequency) pair of `tokens` whose token is longer
    than 3 characters through words().add().
    """
    scrapedate = time.strftime('%Y-%m-%d')
    docid = documents().add(suborgid, orgid, sourceurl, documentdate,
                            scrapedate, name, dochash, orphaned)
    documenttexts().add(docid, pdftext)

    word_store = words()
    # Tokens of 3 characters or fewer are not stored.
    long_tokens = ((tok, freq) for tok, freq in tokens.items() if len(tok) > 3)
    for token, frequency in long_tokens:
        word_store.add(docid, suborgid, orgid, token, frequency)
    return docid
def create_index(files):
    """
    Map every word to the set of entries of `files` whose text contains it.

    NOTE(review): the sets hold the file entries themselves (files[k]),
    not their integer positions -- confirm this is what callers expect.
    """
    d = {}
    for file in files:
        for word in words(get_text(file)):
            # setdefault creates the set on first sight of the word.
            d.setdefault(word, set()).add(file)
    return d
Exemplo n.º 21
0
def test_linear_berlitz():
    """Searching for "hawaii travel" linearly must match only HistoryHawaii.txt."""
    files = filelist(rootdir)
    search_terms = words("hawaii travel")
    matches = linear_search(files, search_terms)

    assert filenames(matches) == ['HistoryHawaii.txt']
Exemplo n.º 22
0
def test_linear_berlitz_none():
    """A term expected to occur in no file must produce an empty result list."""
    files = filelist(rootdir)
    search_terms = words("missspellinnng")
    matches = linear_search(files, search_terms)

    assert filenames(matches) == []
Exemplo n.º 23
0
 def test_word_occurance3(self):
     # 'fish' appears four times; every other word appears once.
     expected = {'one': 1, 'fish': 4, 'two': 1, 'red': 1, 'blue': 1}
     counted = words("one fish two fish red fish blue fish")
     self.assertDictEqual(
         expected,
         counted,
         msg='should count multiple occurrences')
Exemplo n.º 24
0
def test_linear_berlitz_none():
    """linear_search for "missspellinnng" is expected to match no file."""
    files = filelist(rootdir)
    query = words("missspellinnng")

    found = filenames(linear_search(files, query))

    assert found == []
Exemplo n.º 25
0
def test_myhtable_berlitz():
    """The htable-backed search for "hawaii travel" must yield only HistoryHawaii.txt."""
    files = filelist(rootdir)
    query = words("hawaii travel")

    # Build the index first, then query it.
    table = myhtable_create_index(files)
    hits = myhtable_index_search(files, table, query)

    assert filenames(hits) == ['HistoryHawaii.txt']
Exemplo n.º 26
0
def test_myhtable_berlitz_none():
    """The htable-backed search for "missspellinnng" is expected to match no file."""
    files = filelist(rootdir)
    query = words("missspellinnng")

    table = myhtable_create_index(files)
    hits = myhtable_index_search(files, table, query)

    assert filenames(hits) == []
Exemplo n.º 27
0
def test_myhtable_berlitz_none():
    """A term expected in no file must give an empty result from the htable search."""
    files = filelist(rootdir)
    search_terms = words("missspellinnng")
    index = myhtable_create_index(files)

    found = filenames(myhtable_index_search(files, index, search_terms))

    assert found == []
Exemplo n.º 28
0
def test_myhtable_berlitz():
    """Searching "hawaii travel" through the htable index must match only HistoryHawaii.txt."""
    files = filelist(rootdir)
    search_terms = words("hawaii travel")
    index = myhtable_create_index(files)

    found = filenames(myhtable_index_search(files, index, search_terms))

    assert found == ['HistoryHawaii.txt']
Exemplo n.º 29
0
 def test_word_occurance4(self):
     # Punctuation is expected to be kept: ':' is a token of its own and the
     # trailing symbols stay attached to 'javascript'.
     expected = {
         'car': 1,
         ":": 2,
         'carpet': 1,
         'as': 1,
         'java': 1,
         'javascript!!&@$%^&': 1,
     }
     counted = words('car : carpet as java : javascript!!&@$%^&')
     self.assertDictEqual(expected, counted, msg='should include punctuation')
Exemplo n.º 30
0
def demo(model, user_input=None, **params):
    """
    Run the model on one batch from get_batch(**params) and show the
    target next to the network output.

    When `user_input` is truthy, the word input of the first batch slot is
    replaced by words.indices(user_input) before predicting. The word input
    of that slot is printed via words.words(), and the two images are
    concatenated along axis 1 and passed to imutil.show().

    `params` must contain 'batch_size' (KeyError otherwise).
    """
    # The value itself is not used below; the lookup is kept so a missing
    # 'batch_size' still fails here.
    _ = params['batch_size']

    batch_inputs, batch_targets = get_batch(**params)
    if user_input:
        batch_inputs[1][0] = words.indices(user_input)
    predictions = model.predict(batch_inputs)

    print("Target (left) vs. Network Output (right):")
    pixels = batch_inputs[0][0]
    word_indices = batch_inputs[1][0]
    print(words.words(word_indices))

    target_img = pixels + batch_targets[0] * 255.
    output_img = pixels + map_to_img(predictions[0], **params)
    side_by_side = np.concatenate((target_img, output_img), axis=1)
    imutil.show(side_by_side)
Exemplo n.º 31
0
def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them
    whose file contents has all words in terms as normalized by your
    words() function.

    Parameters:
        files: list of fully-qualified filenames.
        terms: list of strings to look for.

    Returns:
        list of the filenames containing every term, in the order they
        appear in `files`. Performs a linear search, one file at a time.
    """
    set_terms = set(terms)
    final_list = []
    for article in files:
        # Use the loop variable `article`; the name `articles` was never
        # defined and raised NameError on the first file.
        new_data = set(words(get_text(article)))
        if set_terms.issubset(new_data):
            final_list.append(article)
    return final_list
Exemplo n.º 32
0
def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them
    whose file contents has all words in terms as normalized by your
    words() function.

    Parameters:
        files: list of fully-qualified filenames.
        terms: list of strings to look for.

    Returns:
        list of the filenames containing every term, in the order they
        appear in `files`. Performs a linear search, one file at a time.
    """
    # Build the term set once; a plain subset test gives the same answer as
    # converting the terms to a pandas Series for every file.
    required = set(terms)
    result = []
    for file in files:
        contents = words(get_text(file))
        if required.issubset(contents):
            result.append(file)
    return result
Exemplo n.º 33
0
def start_game(player_list: list, phrase: str):
    """
    Create the game, wheel and words objects for `phrase` and start play.

    The wheel is built from the image path under __location__ and started,
    the words object is built from the phrase and started with the wheel,
    and finally the game's start_wof() is called with the players, the
    wheel and the words object.
    """
    image_path = os.path.join(__location__, const.IMG_FOLDER,
                              const.IMG_BASE_NAME)

    # The game object is constructed before the wheel and the words object.
    session = game(phrase)

    spinner = wheel(image_path)
    spinner.start()

    board = words(phrase)
    board.start(spinner)

    session.start_wof(player_list, spinner, board)
Exemplo n.º 34
0
def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes using an htable
    with 4011 buckets.

    For every word of every file, htable_put is called with a one-element
    set holding the file's 0-based position in `files`, and the table is
    rebound to htable_put's return value.

    NOTE(review): this relies on htable_put returning the table and on how
    it treats a key that already exists -- confirm against its
    implementation.
    """
    table = htable(4011)
    for doc_id, path in enumerate(files):
        for key in words(get_text(path)):
            table = htable_put(table, key, {doc_id})
    return table
Exemplo n.º 35
0
def predict(model, x_global, x_local, x_ctx, box, **params):
    """
    Run the model repeatedly, feeding each prediction back in as the last
    index, and return words.words() of the first row of indices.

    The indices start as util.left_pad([words.START_TOKEN_IDX], **params).
    The loop runs for range(1, params['max_words']); each pass rolls the
    indices one step left along axis 1 and writes the argmax of the
    prediction into the last column. `box` is not used here.
    """
    max_words = params['max_words']
    indices = util.left_pad([words.START_TOKEN_IDX], **params)

    # Each input is passed through util.expand(..., 1), in this order.
    x_global, x_local, indices, x_ctx = (
        util.expand(arr, 1) for arr in (x_global, x_local, indices, x_ctx)
    )

    for _ in range(1, max_words):
        preds = model.predict([x_global, x_local, indices, x_ctx])
        # Shift left, then overwrite the last column with the new argmax.
        indices = np.roll(indices, -1, axis=1)
        indices[:, -1] = np.argmax(preds[:], axis=1)

    return words.words(indices[0])
Exemplo n.º 36
0
def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them
    whose file contents has all words in terms as normalized by your
    words() function.

    Parameters:
        files: list of fully-qualified filenames.
        terms: list of strings to look for.

    Returns:
        list of the filenames containing every term, in the order they
        appear in `files`. Performs a linear search, one file at a time.
    """
    # The term set does not change between files, so build it once.
    required = set(terms)
    # "terms == words & terms" is the subset test, spelled directly.
    return [
        file for file in files
        if required.issubset(words(get_text(file)))
    ]
Exemplo n.º 37
0
def linear_search(files, terms):
    """
    Return the filenames in `files` whose contents contain every term.

    Each file is read with get_text() and split with words(); a file
    qualifies when the set of terms is a subset of its words. Files are
    examined one after the other and returned in input order.
    """
    matches = []
    for path in files:
        file_words = words(get_text(path))
        # Guard clause: skip files missing at least one term.
        if not set(terms).issubset(file_words):
            continue
        matches.append(path)
    return matches
Exemplo n.º 38
0
def create_index(files):
    """
    Build an index from word to the set of document IDs containing it.

    A document ID is the 0-based index of a file in ``files``.

    Parameters:
        files: list of fully-qualified filenames.

    Returns:
        defaultdict(set) mapping each word to a set of doc IDs. An empty
        `files` list yields an empty index rather than None, so callers can
        always treat the result as a dict.
    """
    index = defaultdict(set)
    for doc_id, path in enumerate(files):
        for word in words(get_text(path)):
            index[word].add(doc_id)
    return index
Exemplo n.º 39
0
def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes in an htable with
    4011 buckets.

    Works in two passes over the words of all files: the first puts an
    empty set under every word, the second adds each file's 0-based
    position to the set fetched for each of its words.
    """
    words_per_file = [words(get_text(path)) for path in files]
    table = htable(4011)

    # Pass 1: every word gets an empty set.
    for doc_words in words_per_file:
        for word in doc_words:
            htable_put(table, word, set())

    # Pass 2: record which documents contain each word.
    for doc_id, doc_words in enumerate(words_per_file):
        for word in doc_words:
            htable_get(table, word).add(doc_id)
    return table
Exemplo n.º 40
0
def main():
    """
    Run the interactive loop: read a line, answer it if w.check() knows it,
    otherwise ask for its meaning and store the pair with w.input().

    Typing "exit" or "さようなら" calls end(screen). Text is rendered in
    white at a fixed position and shown for two seconds before the screen
    is redrawn with drawtony(). Uses raw_input/unicode, i.e. Python 2.
    """
    pygame.init()
    screen = pygame.display.set_mode((1024,768))
    font = pygame.font.Font("./font/crayon.ttf", 70)
    white = (255,255,255)
    text_pos = (70, 524 + 188 / 2 - 10)

    drawtony(screen)
    w = words.words()
    w.init()
    while True:
        line = raw_input()
        heard = unicode(line, "utf-8")
        if heard in (u"exit", u"さようなら"):
            end(screen)

        result = w.check(heard)
        if result != -1:
            # Known input: show the stored answer.
            ny = font.render(w.ans[result] + u"ニー!", True, white)
            screen.blit(ny, text_pos)
            pygame.display.update()
            pygame.time.wait(2000)
            drawtony(screen)
        else:
            # Unknown input: ask what it is, then read and store the reply.
            say = font.render(heard + u"ってなあニー?", True, white)
            screen.blit(say, text_pos)
            pygame.display.update()
            reply_line = raw_input()
            taught = unicode(reply_line, "utf-8")
            teach = font.render(taught + u"かあ、ありがトニー!", True, white)
            drawtony(screen)
            screen.blit(teach, text_pos)
            den = pygame.image.load("./image/den.png").convert()
            screen.blit(den, (455, 100))
            w.input(heard, taught)
            pygame.display.update()
            pygame.time.wait(2000)
            drawtony(screen)
Exemplo n.º 41
0
def main():
    """
    Generate one sentence per trend from its tweets and optionally post them.

    For each trend returned by the stream reader, a fresh words() learner
    is fed every tweet text; trends with no tweets are skipped. One
    sentence is generated with gen_sentence(140), printed, and queued.
    When ONLINE is truthy the queued sentences are passed to post_tweets().
    """
    tweets = []

    # read from db
    reader = streamReader()
    trends = reader.getTrends()
    # feed contents with each tag into words to form dictionary
    for tr in trends:
        data = reader.getTweets(tr)
        learner = words()
        learnt_sentence = 0
        for text in data:
            learner.add_sentence(text)
            learnt_sentence += 1
        if learnt_sentence == 0:
            continue
        # Generate once so the sentence printed is the one that is queued;
        # 140 is the length passed as the tweet limit.
        sentence = learner.gen_sentence(140)
        tweets.append(sentence)
        print(sentence)

    # output method
    if ONLINE:
        post_tweets(tweets)
Exemplo n.º 42
0
$ python search.py myhtable ~/data/slate
"""

# Command line: argv[1] selects the search implementation ('linear', 'index'
# or 'myhtable'), argv[2] is the directory handed to filelist().
impl = sys.argv[1]
rootdir = sys.argv[2]
files = filelist(rootdir)
# Uncomment the next line to test just the first 100 files instead of all files
# files = files[:100]
N = len(files)
print(N, "files")

# Built lazily on the first query that needs it, then reused by later queries.
index = None

# Prompt for search terms forever; nothing in these lines leaves the loop.
while True:
    terms = input("Search terms: ")
    terms = words(terms)

    if impl=='linear':
        docs = linear_search(files, terms)
    elif impl == 'index':
        if index is None:
            index = create_index(files)
            print("Index complete")
        docs = index_search(files, index, terms)
    elif impl == 'myhtable':
        if index is None:
            index = myhtable_create_index(files)
            print("Index complete")
        docs = myhtable_index_search(files, index, terms)
    else:
        # NOTE(review): an unknown implementation name only prints this
        # message and prompts again; `docs` is not assigned on this path.
        print("Invalid search type:", impl)
Exemplo n.º 43
0
import sys
import pickle
from words import words
from quote import *
from markov import *
from frequency import probabilities, make_word_walker

# Command line: <quote length> <algorithm: markov|freq> <file> [<file> ...]
quote_length = int(sys.argv[1])
algorithm_selection = sys.argv[2]
files = sys.argv[3:]
prefix_length = 2

# Fail with a clear message up front; otherwise an unknown algorithm would
# leave `walker` undefined and surface later as a NameError.
if algorithm_selection not in ('markov', 'freq'):
    sys.exit("unknown algorithm %r: expected 'markov' or 'freq'"
             % algorithm_selection)

if len(files) == 1 and files[0].split('.')[-1] == "pickle":
    # A single *.pickle argument is a previously saved source model.
    # NOTE(security): unpickling can execute arbitrary code; only load
    # pickle files from a trusted origin.
    with open(files[0], "rb") as pickled:
        source = pickle.load(pickled)
else:
    # Otherwise build the source model from the given text files.
    corpus = words(files)
    if algorithm_selection == 'markov':
        source = markov(corpus, prefix_length)
    else:
        source = probabilities(corpus)

# The walker type depends only on the algorithm, not on where `source`
# came from.
if algorithm_selection == 'markov':
    walker = make_prefix_walker(source)
else:
    walker = make_word_walker(source)

print( quote(quote_length, walker) )
Exemplo n.º 44
0
def savewords(tokens,articleid,sourceid,thedate):
    """
    Store every token longer than 4 characters through words().add().

    `tokens` maps token -> frequency; each stored token is added together
    with its frequency and the given article id, source id and date.
    """
    store = words()
    for token, frequency in tokens.items():
        # Guard clause: tokens of 4 characters or fewer are skipped.
        if len(token) <= 4:
            continue
        store.add(token, frequency, articleid, sourceid, thedate)
Exemplo n.º 45
0
# -*- coding: UTF-8 -*-
from words import words
from flask import Flask,render_template, session, redirect, url_for, flash
from flask.ext.bootstrap import Bootstrap
from forms.validateForm import validateForm
import sys
# Python 2 idiom: reload sys so setdefaultencoding is available, then make
# UTF-8 the default string encoding. Neither call works on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
# Flask application object and its Bootstrap extension.
app = Flask(__name__)
bootstrap = Bootstrap(app)
# NOTE(review): the secret key is hard-coded in source; consider loading it
# from configuration or the environment before deploying.
app.config['SECRET_KEY'] = 'zhaoyuan work'


# Module-level word store, filled once at import time by the query below.
WORDS = words()
WORDS.get_words_from_db('select * from word')


@app.route('/')
def hello_world():
	"""Serve the site root with a fixed greeting."""
	greeting = 'Hello world'
	return greeting

@app.route('/play',methods=['get','post'])
def test_validate():
	form = validateForm()
	if form.validate_on_submit():
		# The first one of the data This is only test
		words = form.words.data
		meaning = form.meaning.data
		if not WORDS.match_words(words,meaning):
			flash('哦吼,%s的意思没有输入对哦' % words)
		session['words'] = words