def create_index_old(files):
    """
    Given a list of fully-qualified filenames, build an index from word
    to set of document IDs. A document ID is just the index into the
    files parameter (indexed from 0) to get the file name. Make sure that
    you are mapping a word to a set of doc IDs, not a list.
    For each word w in file i, add i to the set of document IDs containing w
    Return a dict object mapping a word to a set of doc IDs.
    """
    # Fix: the old code added `i + 1`, producing 1-based doc IDs, which
    # contradicts the docstring ("indexed from 0").
    # Also index in a single pass per file instead of building one giant
    # word list and re-scanning every document for every unique word
    # (the old approach was O(unique_words * docs)).
    index = {}
    for doc_id, fname in enumerate(files):
        for w in toUnique(words(get_text(fname))):
            index.setdefault(w, set()).add(doc_id)
    return index
def dotest(terms, expected, which):
    """Run one search implementation (0=linear, 1=dict index, 2=htable index)
    over the corpus and assert the sorted result matches `expected`."""
    files = filelist(rootdir)
    terms = words(terms)
    if which == 0:
        docs = linear_search(files, terms)
    elif which == 1:
        docs = index_search(files, create_index(files), terms)
    else:
        docs = myhtable_index_search(files, myhtable_create_index(files), terms)
    names = filenames(docs)
    names.sort()
    expected.sort()
    assert names == expected, "found "+str(names)+" != expected "+str(expected)
def create_index(files):
    """
    Given a list of fully-qualified filenames, build an index from word
    to set of document IDs. A document ID is just the index into the
    files parameter (indexed from 0) to get the file name. Make sure that
    you are mapping a word to a set of doc IDs, not a list.
    For each word w in file i, add i to the set of document IDs containing w
    Return a dict object mapping a word to a set of doc IDs.
    """
    index = {}
    # enumerate() replaces the manual filename -> position dict; the doc ID
    # is simply the file's position in `files`.
    for doc_id, fname in enumerate(files):
        for term in words(get_text(fname)):
            # `term in index` is the idiomatic spelling of the old
            # `index.__contains__(term) == True`.
            if term in index:
                index[term].add(doc_id)
            else:
                index[term] = {doc_id}
    return index
def getCategories (self, filename):
    """Read `filename + 'categories.txt'` (one category file name per line),
    load each listed file's lines, and return a dict of category name
    (file name minus extension) -> list of lines."""
    categories = []
    categoriesFile = open(filename+'categories.txt','r')  # NOTE(review): file handle never closed
    for line in categoriesFile:
        categories.append (line)
    for category in categories:
        if '\n' in category:
            # strip the trailing newline from the category file name
            category = category[:len(category)-1]
        cFile = open (filename+category, 'r')  # NOTE(review): file handle never closed
        cFileList = []
        for line in cFile:
            # strip a trailing '\n' and then '\r' (handles CRLF endings)
            if '\n' in line:
                line = line[:len(line)-1]
            if '\r' in line:
                line = line[:len(line)-1]
            cFileList.append (line)
        if category == 'stopwords.txt':
            self.words = words (cFileList)
        # NOTE(review): rebinding `categories` to a fresh {} on every
        # iteration discards all previously collected categories — only the
        # last category read survives in the returned dict. Confirm whether
        # this is intentional before relying on the result.
        categories = {}
        categories[category.split('.')[0]] = cFileList
    # NOTE(review): this KeyErrors unless the last listed category was
    # 'stopwords.txt' — presumably categories.txt guarantees that; verify.
    categories['stopwords'] = self.words.listToDict(categories['stopwords'])
    # self.categoryOccurrences[category.split('.')[0]] = 0
    return categories
def test_word_occurance8(self):
    # Newlines separate words but are not themselves counted.
    counts = words('hello\nworld')
    self.assertDictEqual({'hello': 1, 'world': 1}, counts,
                         msg='should not count multilines')
def test_word_occurance9(self):
    # Tabs separate words but are not themselves counted.
    counts = words('hello\tworld')
    self.assertDictEqual({'hello': 1, 'world': 1}, counts,
                         msg='should not count tabs')
def test_word_occurance0(self):
    # Runs of spaces collapse to a single separator.
    counts = words('hello  world')
    self.assertDictEqual({'hello': 1, 'world': 1}, counts,
                         msg='should count multiple spaces as one')
def create_index(files):
    """
    Given a list of fully-qualified filenames, build an index from word
    to set of document IDs. A document ID is just the index into the
    files parameter (indexed from 0) to get the file name. Make sure that
    you are mapping a word to a set of doc IDs, not a list.
    For each word w in file i, add i to the set of document IDs containing w
    Return a dict object mapping a word to a set of doc IDs.
    """
    # Fix: the old `defaultdict()` had no default_factory, so it behaved as a
    # plain dict and still needed an explicit membership check. Supplying
    # `set` as the factory removes the branch entirely.
    dct_index = defaultdict(set)
    for idx, fname in enumerate(files):      # doc ID = position in `files`
        s_content = get_text(fname)          # file contents as one string
        for word in words(s_content):        # normalized word list
            dct_index[word].add(idx)         # auto-creates the set on first sight
    return dct_index
def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes
    This does the exact same thing as create_index() except that it uses
    your htable.  As a number of htable buckets, use 4011.
    Returns a list-of-buckets hashtable representation.
    """
    # Removed the large block of commented-out dict-based dead code that
    # duplicated create_index(); it only obscured the htable implementation.
    NBUCKETS = 4011
    table = htable(NBUCKETS)
    for idx, fname in enumerate(files):      # doc ID = position in `files`
        s_content = get_text(fname)          # file contents as one string
        for word in words(s_content):        # normalized word list
            set_IDs = htable_get(table, word)
            if set_IDs is None:              # `is None`, not `== None`
                htable_put(table, word, {idx})  # first sighting of this word
            else:
                set_IDs.add(idx)             # mutate the stored set in place
    return table
def test_word_occurance2(self):
    # Three distinct words, each appearing exactly once.
    counts = words("one of each")
    self.assertDictEqual({'one': 1, 'of': 1, 'each': 1}, counts,
                         msg='should count one of each')
def test_word_occurance5(self):
    # Numeric tokens become int keys; repeated words accumulate.
    counts = words('testing 1 2 testing')
    self.assertDictEqual({'testing': 2, 1: 1, 2: 1}, counts,
                         msg='should include numbers')
def test_word_occurance6(self):
    # Case-sensitive counting: each casing is its own key.
    counts = words('go Go GO')
    self.assertDictEqual({'go': 1, 'Go': 1, 'GO': 1}, counts,
                         msg='should respect case')
def test_01(self):
    """String vacia"""
    # Fix: the old body appended to `self.result` but asserted on
    # `self.results` — the collected words were never actually checked.
    # Use `self.results` consistently (the attribute the assertion reads).
    # Also: assertEquals is a deprecated alias of assertEqual.
    test_string = ""
    for word in words(test_string):
        self.results.append(word)
    self.assertEqual(len(self.results), 0, "Falla: " + self.__doc__)
def linear_search(files, terms):
    """Return the files whose word sets contain every term in `terms`.

    Fix: the old test used `searchTerms < fileWords` (strict/proper subset),
    which wrongly rejected a file whose word set is exactly equal to the
    search terms. `<=` (issubset) is the correct containment test.
    """
    returnFiles = []
    searchTerms = set(terms)
    for item in files:
        fileWords = set(words(get_text(item)))
        if searchTerms <= fileWords:  # subset-or-equal, not strict subset
            returnFiles.append(item)
    return returnFiles
def test_word_occurance7(self):
    # Non-ASCII words (Spanish punctuation, Cyrillic) are counted like any other.
    counts = words('¡Hola! ¿Qué tal? Привет!')
    self.assertDictEqual({"¡Hola!": 1, "¿Qué": 1, "tal?": 1, "Привет!": 1},
                         counts,
                         msg='should count international characters properly')
def myhtable_create_index(files):
    """Index the unique words of each file into a 4011-bucket htable,
    keyed by word with the file's position in `files` as the value."""
    wordBook = htable(4011)
    # enumerate() replaces the hand-maintained fileIndex counter.
    for fileIndex, item in enumerate(files):
        for word in set(words(get_text(item))):
            htable_put(wordBook, word, fileIndex)
    return wordBook
def test_linear_berlitz():
    # Linear search for two terms that co-occur in exactly one document.
    corpus = filelist(rootdir)
    search_terms = words("hawaii travel")
    found = linear_search(corpus, search_terms)
    assert filenames(found) == ['HistoryHawaii.txt']
def savedoc(suborgid, orgid, sourceurl, documentdate, name, dochash, pdftext, tokens, orphaned):
    """Persist one scraped document, its extracted text, and its significant
    tokens (length > 3), returning the new document's ID."""
    scrapedate = time.strftime('%Y-%m-%d')
    docid = documents().add(suborgid, orgid, sourceurl, documentdate,
                            scrapedate, name, dochash, orphaned)
    documenttexts().add(docid, pdftext)
    word_store = words()
    for token, frequency in tokens.items():
        if len(token) > 3:  # skip short/noise tokens
            word_store.add(docid, suborgid, orgid, token, frequency)
    return docid
def create_index(files):
    """Map each word to the set of file names whose contents contain it."""
    index = {}
    for _, fname in enumerate(files):
        for word in words(get_text(fname)):
            # setdefault collapses the old "new key vs. existing key" branch
            index.setdefault(word, set()).add(fname)
    return index
def test_linear_berlitz():
    # "hawaii travel" should match exactly the Hawaii history document.
    corpus = filelist(rootdir)
    found = linear_search(corpus, words("hawaii travel"))
    assert filenames(found) == ['HistoryHawaii.txt']
def test_linear_berlitz_none():
    # A misspelled term occurs in no document: empty result expected.
    corpus = filelist(rootdir)
    found = linear_search(corpus, words("missspellinnng"))
    assert filenames(found) == []
def test_word_occurance3(self):
    # 'fish' appears four times; every other word once.
    counts = words("one fish two fish red fish blue fish")
    self.assertDictEqual(
        {'one': 1, 'fish': 4, 'two': 1, 'red': 1, 'blue': 1},
        counts,
        msg='should count multiple occurrences')
def test_linear_berlitz_none():
    # No document contains the misspelled term, so nothing is returned.
    corpus = filelist(rootdir)
    search_terms = words("missspellinnng")
    found = linear_search(corpus, search_terms)
    assert filenames(found) == []
def test_myhtable_berlitz():
    # The htable-backed index must find the same single Hawaii document.
    corpus = filelist(rootdir)
    search_terms = words("hawaii travel")
    table = myhtable_create_index(corpus)
    found = myhtable_index_search(corpus, table, search_terms)
    assert filenames(found) == ['HistoryHawaii.txt']
def test_myhtable_berlitz_none():
    # A term absent from the corpus yields an empty htable search result.
    corpus = filelist(rootdir)
    search_terms = words("missspellinnng")
    table = myhtable_create_index(corpus)
    found = myhtable_index_search(corpus, table, search_terms)
    assert filenames(found) == []
def test_word_occurance4(self):
    # Punctuation-bearing tokens count as-is; ':' appears twice.
    counts = words('car : carpet as java : javascript!!&@$%^&')
    self.assertDictEqual(
        {'car': 1, ":": 2, 'carpet': 1, 'as': 1, 'java': 1,
         'javascript!!&@$%^&': 1},
        counts,
        msg='should include punctuation')
def demo(model, user_input=None, **params): batch_size = params['batch_size'] # Data for demo prediction X, Y = get_batch(**params) if user_input: X[1][0] = words.indices(user_input) preds = model.predict(X) print("Target (left) vs. Network Output (right):") input_pixels, input_words = X[0][0], X[1][0] print(words.words(input_words)) left = input_pixels + Y[0] * 255. right = input_pixels + map_to_img(preds[0], **params) imutil.show(np.concatenate((left, right), axis=1))
def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them whose
    file contents has all words in terms as normalized by your words() function.
    Parameter terms is a list of strings.
    Perform a linear search, looking at each file one after the other.
    """
    # Fix: the loop variable is `article`, but the old body referenced the
    # undefined name `articles` in get_text() and append(), raising NameError
    # on the first iteration.
    final_list = []
    set_terms = set(terms)
    for article in files:
        data = get_text(article)
        new_data = set(words(data))
        if set_terms.issubset(new_data):
            final_list.append(article)
    return final_list
def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them whose
    file contents has all words in terms as normalized by your words() function.
    Parameter terms is a list of strings.
    Perform a linear search, looking at each file one after the other.
    """
    # Fix: the old body rebuilt a pandas Series from `terms` on every
    # iteration and tested `all(...) == True`; a plain set subset test is
    # equivalent for string terms, avoids clobbering the `terms` parameter,
    # and removes the pandas dependency from this function.
    result = []
    term_set = set(terms)
    for file in files:
        contents = words(get_text(file))
        if term_set.issubset(contents):
            result.append(file)
    return result
def start_game(player_list: list, phrase: str):
    """Set up the wheel and word board for `phrase`, then run the round."""
    wheel_img_folder_path = os.path.join(__location__, const.IMG_FOLDER,
                                         const.IMG_BASE_NAME)
    round_state = game(phrase)            # game state for this phrase
    spin_wheel = wheel(wheel_img_folder_path)
    spin_wheel.start()                    # show wheel image and pointer
    board = words(phrase)                 # word board for the phrase
    board.start(spin_wheel)               # print out the blank letters
    round_state.start_wof(player_list, spin_wheel, board)
def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes
    This does the exact same thing as create_index() except that it uses
    your htable.  As a number of htable buckets, use 4011.
    Returns a list-of-buckets hashtable representation.
    """
    # Fix: the old code called htable_put(table, key, {value}) for every
    # occurrence of a word, replacing the stored set each time — so a word
    # seen in several documents kept only the last doc ID instead of the
    # full set. Fetch the existing set and mutate it in place instead.
    # (Also: htable_put's return value is no longer assigned back; the
    # table is mutated in place, matching how the other call sites use it.)
    nbuckets = 4011
    table = htable(nbuckets)
    for doc_id, fname in enumerate(files):
        for key in words(get_text(fname)):
            doc_ids = htable_get(table, key)
            if doc_ids is None:
                htable_put(table, key, {doc_id})
            else:
                doc_ids.add(doc_id)
    return table
def predict(model, x_global, x_local, x_ctx, box, **params): max_words = params['max_words'] # An entire batch must be run at once, but we only use the first slot in that batch indices = util.left_pad([words.START_TOKEN_IDX], **params) x_global = util.expand(x_global, 1) x_local = util.expand(x_local, 1) indices = util.expand(indices, 1) x_ctx = util.expand(x_ctx, 1) # Input is empty padding followed by start token output_words = [] for i in range(1, max_words): preds = model.predict([x_global, x_local, indices, x_ctx]) indices = np.roll(indices, -1, axis=1) indices[:, -1] = np.argmax(preds[:], axis=1) return words.words(indices[0])
def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them whose
    file contents has all words in terms as normalized by your words() function.
    Parameter terms is a list of strings.
    Perform a linear search, looking at each file one after the other.
    """
    # Cleanup: removed two large blocks of commented-out exploratory code and
    # the unused `all_in`/`idx` variables. The live condition
    # `set(terms) == set(words(...)).intersection(set(terms))` is exactly
    # subset containment, so spell it as issubset and hoist the term set
    # out of the loop.
    lst_qualified = []
    term_set = set(terms)
    for file in files:
        if term_set.issubset(words(get_text(file))):
            lst_qualified.append(file)
    return lst_qualified
def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them whose
    file contents has all words in terms as normalized by your words() function.
    Parameter terms is a list of strings.
    Perform a linear search, looking at each file one after the other.
    """
    matches = []
    wanted = set(terms)
    for fname in files:
        # tokenize the file and keep it if every search term appears
        file_words = words(get_text(fname))
        if wanted.issubset(file_words):
            matches.append(fname)
    return matches
def create_index(files):
    """
    Given a list of fully-qualified filenames, build an index from word
    to set of document IDs. A document ID is just the index into the
    files parameter (indexed from 0) to get the file name. Make sure that
    you are mapping a word to a set of doc IDs, not a list.
    For each word w in file i, add i to the set of document IDs containing w
    Return a dict object mapping a word to a set of doc IDs.
    """
    if not files:          # same guard as len(files) <= 0 for a list
        return None
    index = defaultdict(set)
    for doc_id, fname in enumerate(files):
        for word in words(get_text(fname)):
            index[word].add(doc_id)
    return index
def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes
    This does the exact same thing as create_index() except that it uses
    your htable.  As a number of htable buckets, use 4011.
    Returns a list-of-buckets hashtable representation.
    """
    # Tokenize every file once up front.
    wordlist = [words(get_text(fname)) for fname in files]
    table = htable(4011)
    # Pass 1: seed every word with an empty set.
    for doc_words in wordlist:
        for word in doc_words:
            htable_put(table, word, set())
    # Pass 2: mutate each word's stored set in place with its doc IDs.
    for doc_id, doc_words in enumerate(wordlist):
        for word in doc_words:
            htable_get(table, word).add(doc_id)
    return table
def main():
    """Chatbot loop (Python 2): read a line, answer known words with a
    rendered reply, or ask the user to teach an unknown word."""
    pygame.init()
    screen = pygame.display.set_mode((1024, 768))
    font = pygame.font.Font("./font/crayon.ttf", 70)
    drawtony(screen)
    w = words.words()
    w.init()
    while 1:
        input = raw_input()
        input = unicode(input, "utf-8")
        if input == u"exit":
            end(screen)
        if input == u"さようなら":
            end(screen)
        result = w.check(input)
        if result != -1:
            # Known word: render the stored answer for two seconds.
            ny = font.render(w.ans[result] + u"ニー!", True, (255, 255, 255))
            screen.blit(ny, (70, 524 + 188 / 2 - 10))
            pygame.display.update()
            pygame.time.wait(2000)
            drawtony(screen)
            continue
        # Unknown word: ask what it means, store the user's explanation.
        say = font.render(input + u"ってなあニー?", True, (255, 255, 255))
        screen.blit(say, (70, 524 + 188 / 2 - 10))
        pygame.display.update()
        input2 = raw_input()
        input2 = unicode(input2, "utf-8")
        teach = font.render(input2 + u"かあ、ありがトニー!", True, (255, 255, 255))
        drawtony(screen)
        screen.blit(teach, (70, 524 + 188 / 2 - 10))
        den = pygame.image.load("./image/den.png").convert()
        screen.blit(den, (455, 100))
        w.input(input, input2)
        pygame.display.update()
        pygame.time.wait(2000)
        drawtony(screen)
def main(): tweets = [] # read from db reader = streamReader() trends = reader.getTrends() # feed contents with each tag into words to form dictionary for tr in trends: data = reader.getTweets(tr) learner = words() learnt_sentence = 0 for text in data: learner.add_sentence(text) learnt_sentence += 1 if learnt_sentence == 0: continue tweets.append(learner.gen_sentence(140)) print learner.gen_sentence(140) # generate the sentence within the length limit of tweets # output method if ONLINE: post_tweets(tweets)
$ python search.py myhtable ~/data/slate
"""
impl = sys.argv[1]     # implementation to use: 'linear', 'index', or 'myhtable'
rootdir = sys.argv[2]  # root directory of the document corpus
files = filelist(rootdir)
# Uncomment the next line to test just the first 100 files instead of all files
# files = files[:100]
N = len(files)
print(N, "files")
index = None  # built lazily on the first indexed search, then reused
while True:
    terms = input("Search terms: ")
    terms = words(terms)  # normalize the query the same way documents are
    if impl=='linear':
        docs = linear_search(files, terms)
    elif impl == 'index':
        if index is None:
            index = create_index(files)
            print("Index complete")
        docs = index_search(files, index, terms)
    elif impl == 'myhtable':
        if index is None:
            index = myhtable_create_index(files)
            print("Index complete")
        docs = myhtable_index_search(files, index, terms)
    else:
        print("Invalid search type:", impl)
import sys
import pickle
from words import words
from quote import *
from markov import *
from frequency import probabilities, make_word_walker

# CLI: quote_length algorithm(markov|freq) corpus-files... (or one .pickle)
quote_length = int(sys.argv[1])
algorithm_selection = sys.argv[2]
files = sys.argv[3:]  # fix: sys.argv[3:len(sys.argv)] is just sys.argv[3:]
prefix_length = 2

if len(files) == 1 and files[0].split('.')[-1] == "pickle":
    # Pre-built model: load it instead of re-reading a corpus.
    # Fix: use a context manager so the pickle file handle is closed.
    with open(files[0], "rb") as pickle_file:
        source = pickle.load(pickle_file)
    if algorithm_selection == 'markov':
        walker = make_prefix_walker(source)
    elif algorithm_selection == 'freq':
        walker = make_word_walker(source)
else:
    # Build the model from the raw corpus files.
    corpus = words(files)
    if algorithm_selection == 'markov':
        source = markov(corpus, prefix_length)
        walker = make_prefix_walker(source)
    elif algorithm_selection == 'freq':
        source = probabilities(corpus)
        walker = make_word_walker(source)

print( quote(quote_length, walker) )
def savewords(tokens, articleid, sourceid, thedate):
    """Store every token longer than four characters, with its frequency,
    against the given article, source, and date."""
    word_store = words()
    for token, frequency in tokens.items():
        if len(token) > 4:  # skip short/noise tokens
            word_store.add(token, frequency, articleid, sourceid, thedate)
# -*- coding: UTF-8 -*- from words import words from flask import Flask,render_template, session, redirect, url_for, flash from flask.ext.bootstrap import Bootstrap from forms.validateForm import validateForm import sys reload(sys) sys.setdefaultencoding('utf-8') app = Flask(__name__) bootstrap = Bootstrap(app) app.config['SECRET_KEY'] = 'zhaoyuan work' WORDS = words() WORDS.get_words_from_db('select * from word') @app.route('/') def hello_world(): return 'Hello world' @app.route('/play',methods=['get','post']) def test_validate(): form = validateForm() if form.validate_on_submit(): # The first one of the data This is only test words = form.words.data meaning = form.meaning.data if not WORDS.match_words(words,meaning): flash('哦吼,%s的意思没有输入对哦' % words) session['words'] = words