def calculateKeywords(self):
    self.keywords = WordList()
    self.avoids = WordList()
    self.wordAvg = 0
    self.avgRatio = 0
    usef = 0
    usel = 0
    for data in self.wl.words:
        self.wordAvg += data.occ
        usef += data.usef
        usel += data.usel
    l = len(self.wl.words)
    if l == 0:
        l = 1
    self.wordAvg = self.wordAvg / l
    if usel == 0:
        self.avgRatio = usef
    else:
        self.avgRatio = usef / usel
    for data in self.wl.words:
        if data.usel == 0:
            ratio = data.usef
        else:
            ratio = data.usef / data.usel
        if ratio > self.avgRatio * self.ratioDiff:
            self.keywords.set(data.word, data.occ, data.usef, data.usel)
        elif ratio < self.avgRatio / self.ratioDiff:
            self.avoids.set(data.word, data.occ, data.usef, data.usel)
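# For intuition, a minimal self-contained sketch of the thresholding above,
# using plain tuples in place of WordList entries (the sample counts are made
# up; ratio_diff=2 matches the ratioDiff default used in the class below):
words = [("python", 10, 8, 1), ("click", 12, 2, 9), ("page", 7, 3, 3)]  # (word, occ, usef, usel)

ratio_diff = 2
usef = sum(w[2] for w in words)  # 13
usel = sum(w[3] for w in words)  # 13
avg_ratio = usef if usel == 0 else usef / usel  # 1.0

keywords = [w[0] for w in words if (w[2] if w[3] == 0 else w[2] / w[3]) > avg_ratio * ratio_diff]
avoids = [w[0] for w in words if (w[2] if w[3] == 0 else w[2] / w[3]) < avg_ratio / ratio_diff]
print(keywords)  # ['python']  (ratio 8.0 > 2.0)
print(avoids)    # ['click']   (ratio 0.22 < 0.5)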
def getWords(self, tree):
    wl = WordList()
    for data in tree.findall('words/word'):
        word = data.text
        occ = int(data.get('occured'))
        usf = int(data.get('useful'))
        usl = int(data.get('useless'))
        wl.set(word, occ, usf, usl)
    return wl
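# For reference, a self-contained sketch of the XML shape this parser assumes;
# the element and attribute names are taken from the code above, the sample
# data is made up:
import xml.etree.ElementTree as ET

sample = """
<root>
  <words>
    <word occured="5" useful="4" useless="1">python</word>
    <word occured="3" useful="0" useless="3">banner</word>
  </words>
</root>
"""

tree = ET.fromstring(sample)
for data in tree.findall('words/word'):
    print(data.text, int(data.get('occured')), int(data.get('useful')), int(data.get('useless')))
# python 5 4 1
# banner 3 0 3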
def correct_words(classified_words, actual_words, dictionary_word_list, names_word_list, mode='single'):
    corrected_words = classified_words[:]
    for word_index in range(len(corrected_words)):
        current_word = corrected_words[word_index]
        lowercase_word = corrected_words[word_index].lower()
        # Don't carry out correction for valid single letters
        valid_single_letters = 'ai'
        if lowercase_word not in valid_single_letters:
            # Preserve capitalisation of first letter of each word
            capitalised = current_word[0].isupper()
            if not dictionary_word_list.word_match(lowercase_word) and not names_word_list.word_match(lowercase_word):
                name_word = names_word_list.correct_word(lowercase_word, isCapitalised=capitalised, mode=mode)
                dictionary_word = dictionary_word_list.correct_word(lowercase_word, isCapitalised=capitalised, mode=mode)
                # Skip this word if no valid corrections were found
                if name_word is None and dictionary_word is None:
                    continue
                # If no name word was found, use the dictionary word
                elif name_word is None and dictionary_word is not None:
                    current_word = dictionary_word
                # If no dictionary word was found, use the name word
                elif name_word is not None and dictionary_word is None:
                    current_word = name_word
                # See if the word is closer to a name or a dictionary word and correct it to the closer one
                elif WordList.word_difference(current_word, name_word) < dictionary_word_list.word_difference(current_word, dictionary_word):
                    current_word = name_word
                else:
                    current_word = dictionary_word
        corrected_words[word_index] = current_word
    return corrected_words
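# A minimal standalone sketch of the same "closest candidate wins" idea, using
# difflib from the standard library in place of the project's word_difference
# and correct_word methods (the WordList API above is assumed, not shown here):
import difflib

def closest_correction(word, names, dictionary):
    name_match = difflib.get_close_matches(word, names, n=1)
    dict_match = difflib.get_close_matches(word, dictionary, n=1)
    if not name_match and not dict_match:
        return None          # no valid correction found
    if not dict_match:
        return name_match[0]
    if not name_match:
        return dict_match[0]
    # Higher ratio means more similar, so prefer the closer candidate
    name_score = difflib.SequenceMatcher(None, word, name_match[0]).ratio()
    dict_score = difflib.SequenceMatcher(None, word, dict_match[0]).ratio()
    return name_match[0] if name_score > dict_score else dict_match[0]

print(closest_correction("helo", ["henry"], ["hello", "help"]))  # hello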
def test_solve():
    solver = Solver()
    wordlist = WordList().words
    dealt = 'JREZQXW'
    expected = [
        'ER', 'EX', 'JEW', 'RE', 'REW', 'REX', 'REZ', 'WE', 'WEX', 'ZEX'
    ]
    assert solver.solve(dealt) == expected
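# For intuition, a tiny self-contained sketch of what such a solver does: find
# every dictionary word that can be built from the dealt letters (Solver and
# WordList above are project-specific; the word set here is a stand-in):
from collections import Counter

def solve(dealt, words):
    dealt_counts = Counter(dealt)
    # A word is playable if it needs no letter more often than it was dealt
    return sorted(w for w in words if not (Counter(w) - dealt_counts))

print(solve('JREZQXW', ['ER', 'EX', 'JEW', 'QI', 'JAZZ']))  # ['ER', 'EX', 'JEW']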
def test_init(self):
    char_mat = CharMat()
    word_list = WordList()
    game_status = GameStatus(char_mat, word_list)
    self.assertEqual(game_status.word_list, word_list)
    self.assertEqual(game_status.char_mat, char_mat)
    self.assertEqual(game_status.score, 0)
    self.assertEqual(game_status.time, 60)
    self.assertEqual(game_status.times_up, False)
    self.assertEqual(game_status.success, False)
def __init__(self, grid, word_list_filename, print_func=print):
    """Inits WordSearch from a grid of letters and an input word-list text file.

    Args:
        grid (list) - letters grid, as a list of n m-letter strings
        word_list_filename (str) - full path to the input word-list text file
        print_func (func(x)) - [optional] print-word function
    """
    self.set_grid(grid)
    self._word_list = WordList(word_list_filename)
    self._print_func = print_func
    self._no_found_words = 0
def create_word_list(file_name):
    """Creates a WordList object from an input word-list text file.

    Args:
        file_name (str) - full path to the input word-list UTF-8 text file,
            one word per line.

    Returns:
        The created WordList object
    """
    print("creating word-list from " + str(file_name) + " file... ")
    # Use the given file name rather than a hard-coded "word.list" path
    word_list = WordList(file_name)
    print("[DONE]")
    return word_list
def storeWords(self):
    self.wl = WordList()
    xReader = XMLReader()
    xParser = XMLParser()
    if xReader.checkIfExistsQuiet('xml/words.xml'):
        tree = xReader.getTree('xml/words.xml')
        wordAvg, avgRatio = xParser.getGeneralFromWords(tree)
        self.wl = xParser.getWords(tree)
    usf = 0
    usl = 0
    if self.vote == "up":
        usf = 1
    else:
        usl = 1
    for ind, obj in enumerate(self.XMLInspections):
        if obj.ID != self.voteId:
            continue
        pl = PageLoader(obj.fil)
        if not pl.isReadable():
            print('Abort. File not readable:', obj.fil)
            exit()
        pl.read()
        patt = "^[a-zA-Z0-9]*$"
        pl.linkWords = self.removeListElesNotPatterned(patt, pl.linkWords)
        pl.titleWords = self.removeListElesNotPatterned(patt, pl.titleWords)
        pl.headerWords = self.removeListElesNotPatterned(patt, pl.headerWords)
        pl.specialWords = self.removeListElesNotPatterned(patt, pl.specialWords)
        pl.normalWords = self.removeListElesNotPatterned(patt, pl.normalWords)
        for word in pl.linkWords:
            self.wl.append(word, usf, usl)
        for word in pl.titleWords:
            self.wl.append(word, usf, usl)
        for word in pl.headerWords:
            self.wl.append(word, usf, usl)
        for word in pl.specialWords:
            self.wl.append(word, usf, usl)
        for word in pl.normalWords:
            self.wl.append(word, usf, usl)
    return
#!/usr/bin/env python
from word_list import WordList

__author__ = 'kreitzem'

# rack = sys.argv[1].upper()
wl = WordList()
word = "WASTE"
rack = "THAT".upper()
word_num = 0
rack_num = 0
for num in range(0, len(word) - 1, 1):
    temp_word = word[word_num] + rack[rack_num]
    if wl.is_a_word(temp_word):
        print("we got %s" % temp_word)
    else:
        print("not a word %s" % temp_word)
    # Advance both cursors (the original's `word_num = + 1` reset them to 1;
    # the stray bare `next` statements did nothing and are dropped)
    word_num += 1
    rack_num += 1
def __init__(self, anagram_dictionary=WordList().all):
    self.anagram_dictionary = anagram_dictionary
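# Note that WordList().all in the signature above is evaluated once, when the
# function is defined, so all no-argument instances share that same default
# object. A sketch of the lazier idiom, with a hypothetical stub standing in
# for the real WordList:
def load_default_words():
    return ["stop", "tops", "pots"]  # stand-in for WordList().all

class Anagrammer:  # hypothetical host class
    def __init__(self, anagram_dictionary=None):
        if anagram_dictionary is None:
            # Build the (potentially expensive) default per instance, on demand
            anagram_dictionary = load_default_words()
        self.anagram_dictionary = anagram_dictionary

print(Anagrammer().anagram_dictionary)  # ['stop', 'tops', 'pots']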
@classmethod
def setUpClass(cls):
    cls.word_list = WordList('boggle_app/word_lists/en.txt')
def stat_maker():
    word_list = WordList().words
    return StatMaker(word_list)
def toUpper(self, name, prefix='', suffix=''):
    wl = WordList()
    wl.fromSnakeCase(prefix + self.data[name] + suffix)
    return wl.toUCase()
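# A standalone sketch of what the fromSnakeCase/toUCase round trip above
# presumably does (the WordList API is assumed from context; this helper name
# is hypothetical):
def snake_to_upper_camel(name):
    # "word_list" -> ["word", "list"] -> "WordList"
    return "".join(part.capitalize() for part in name.split("_"))

print(snake_to_upper_camel("my_field_name"))  # MyFieldName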
def test_check_word():
    # Create all the game-like variables
    char_mat = CharMat()
    char_mat.set_word("hello", (4, 4), (0, 0))
    word_list = WordList()
    game_status = GameStatus(char_mat, word_list)
class MainUpdater():
    """docstring for MainUpdater"""
    vote = None
    voteId = None
    miPath = None
    XMLInspections = []
    wl = None
    wordAvg = None
    avgRatio = None
    ratioDiff = 2
    keywords = None
    avoids = None

    def __init__(self, vote, voteId, miPath):
        self.vote = vote
        self.voteId = voteId
        self.miPath = miPath
        self.wordXMLPath = "xml/words.xml"

    def loadMasterInspection(self):
        insp = Inspector()
        self.XMLInspections = insp.getInspectionsStr(self.miPath)
        if len(self.XMLInspections) == 0:
            print('Abort. No data found in', self.miPath)
            exit()

    def storeWords(self):
        self.wl = WordList()
        xReader = XMLReader()
        xParser = XMLParser()
        if xReader.checkIfExistsQuiet('xml/words.xml'):
            tree = xReader.getTree('xml/words.xml')
            wordAvg, avgRatio = xParser.getGeneralFromWords(tree)
            self.wl = xParser.getWords(tree)
        usf = 0
        usl = 0
        if self.vote == "up":
            usf = 1
        else:
            usl = 1
        for ind, obj in enumerate(self.XMLInspections):
            if obj.ID != self.voteId:
                continue
            pl = PageLoader(obj.fil)
            if not pl.isReadable():
                print('Abort. File not readable:', obj.fil)
                exit()
            pl.read()
            patt = "^[a-zA-Z0-9]*$"
            pl.linkWords = self.removeListElesNotPatterned(patt, pl.linkWords)
            pl.titleWords = self.removeListElesNotPatterned(patt, pl.titleWords)
            pl.headerWords = self.removeListElesNotPatterned(patt, pl.headerWords)
            pl.specialWords = self.removeListElesNotPatterned(patt, pl.specialWords)
            pl.normalWords = self.removeListElesNotPatterned(patt, pl.normalWords)
            for word in pl.linkWords:
                self.wl.append(word, usf, usl)
            for word in pl.titleWords:
                self.wl.append(word, usf, usl)
            for word in pl.headerWords:
                self.wl.append(word, usf, usl)
            for word in pl.specialWords:
                self.wl.append(word, usf, usl)
            for word in pl.normalWords:
                self.wl.append(word, usf, usl)
        return

    def removeListElesNotPatterned(self, patt, li, maxLen=255):
        indList = []
        for i in range(len(li)):
            if re.match(patt, li[i]) is None or len(li[i]) > maxLen:
                indList.append(i)
        for i in reversed(range(len(indList))):
            li.pop(indList[i])
        return li

    def deleteFile(self):
        for ind, obj in enumerate(self.XMLInspections):
            if obj.ID == self.voteId:
                os = OSTool()
                os.deleteFile(obj.fil)
        return

    def writeWordsXML(self):
        xWriter = XMLWriter()
        xWriter.writeWordXML(self.wl, self.wordAvg, self.avgRatio, self.wordXMLPath)

    def calculateKeywords(self):
        self.keywords = WordList()
        self.avoids = WordList()
        self.wordAvg = 0
        self.avgRatio = 0
        usef = 0
        usel = 0
        for data in self.wl.words:
            self.wordAvg += data.occ
            usef += data.usef
            usel += data.usel
        l = len(self.wl.words)
        if l == 0:
            l = 1
        self.wordAvg = self.wordAvg / l
        if usel == 0:
            self.avgRatio = usef
        else:
            self.avgRatio = usef / usel
        for data in self.wl.words:
            if data.usel == 0:
                ratio = data.usef
            else:
                ratio = data.usef / data.usel
            if ratio > self.avgRatio * self.ratioDiff:
                self.keywords.set(data.word, data.occ, data.usef, data.usel)
            elif ratio < self.avgRatio / self.ratioDiff:
                self.avoids.set(data.word, data.occ, data.usef, data.usel)

    def updateKeywordsXML(self):
        xWriter = XMLWriter()
        xWriter.writeKeywordsXML(self.keywords, self.avoids, 'xml/keywords.xml')

    def updateSitesXMl(self):
        xReader = XMLReader()
        xParser = XMLParser()
        xWriter = XMLWriter()
        tree = xReader.getTree('xml/sites.xml')
        gdSites, bdSites = xParser.getSites(tree)
        data = None
        for obj in self.XMLInspections:
            if obj.ID == self.voteId:
                data = obj
                break
        # Use the matched inspection (data), not the leaked loop variable
        if self.vote == "up":
            gdSites.append(data.url)
        else:
            bdSites.append(data.url)
        xWriter.writeSitesXML(gdSites, bdSites, 'xml/sites.xml')

    def getXMLInspScored(self):
        p = PageToXML(self.XMLInspections, self.keywords, self.avoids)
        self.XMLInspections = p.getScore()
        return self.XMLInspections
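# As an aside, the index-collecting removal in removeListElesNotPatterned can
# be expressed as one filtering pass; a behavior-equivalent sketch (same
# pattern and length cap), except that it returns a new list instead of
# mutating in place:
import re

def keep_patterned(patt, li, max_len=255):
    return [s for s in li if re.match(patt, s) and len(s) <= max_len]

print(keep_patterned("^[a-zA-Z0-9]*$", ["ok123", "no-good!", "fine"]))  # ['ok123', 'fine']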
ap.add_argument('-depth', required=False, type=int,
                help="How deep should we descend into the structure.")
ap.add_argument('-bruteforce', dest='bruteforce', action='store_true',
                help="Do you want to use bruteforce?")
ap.set_defaults(threads=20, depth=2, bruteforce=False)
args = vars(ap.parse_args())
url = args['url']

if args['bruteforce']:
    word_generator = BruteForceGenerator().generator()
elif args['words']:
    word_generator = WordList(args['words']).word_list
else:
    print("Please set the bruteforce flag or specify a path to a text file containing words.")
    exit(0)

dircrawl_manager = DircrawlManager(word_generator, url, args['threads'], args['depth'])
dircrawl_manager.run()

list_for_tabulate = [[request.url, request.http_code]
                     for request in dircrawl_manager.results]
print(tabulate(list_for_tabulate, headers=['URL', 'HTTP CODE'], tablefmt='grid'))
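# If the -words path is given, WordList(args['words']).word_list presumably
# yields candidate names read from a file; a minimal stand-in, assuming one
# word per line (the helper name is hypothetical):
def words_from_file(path):
    with open(path, "r", encoding="utf-8") as handle:
        for line in handle:
            word = line.strip()
            if word:
                yield word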
def preprocess(data_path, is_testing, min_occurrences=5, cache_bow_output=None,
               cache_word2vec_output=None, duration=None):
    if duration:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if os.path.isfile("data/BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data)
    data.sentiment_analysis_by_text()
    print("First five rows with sentiment: ", data.processed_data.head())

    if is_testing:
        data.processed_data.to_csv("data/clean_test_with_sentiments.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/clean_train_with_sentiments.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)

    data = DataTokenize(data)
    data.tokenize()
    data.stem()

    data = WordList(data)
    data.build_wordlist(min_occurrences=min_occurrences)
    word2vec_data = data

    data = BagOfWords(data.processed_data, data.wordlist, is_testing)
    data.build_data_model()
    print("data model head: ", data.data_model.head(5))

    """
    Word 2 vec
    """
    word2vec = Word2VecProvider()
    # REPLACE PATH TO THE FILE
    word2vec.load("../twitter/data/glove.twitter.27B.200d.txt")

    word2vec_data = RedditData(word2vec_data)
    word2vec_data.build_final_model(word2vec)
    word2vec_data_model = word2vec_data.data_model
    if "index" in word2vec_data_model.columns:
        word2vec_data_model.drop("index", axis=1, inplace=True)
    word2vec_data_model.dropna(axis=0, inplace=True)
    word2vec_data_model.reset_index(inplace=True)
    word2vec_data_model.index = word2vec_data_model['timestamp_ms']
    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    """
    Tokenizing the data
    """
    texts = []
    sentiments = []
    tokenized_data = pd.DataFrame()
    for text in data.processed_data["summary"]:
        texts.append(text)
    for sentiment in data.processed_data["sentiment"]:
        sentiments.append(sentiment)
    print("texts: ", texts[0:5])

    tokenizer = Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=200)
    print("\n\n##################################################\npadded sequence head: \n",
          padded_sequences[0:5])
    print("\n####################################################\n padded sequence length \n",
          len(padded_sequences))

    if not is_testing:
        data = Plotting(data)
        data.plot()

    if cache_bow_output is not None:
        data.data_model.to_csv(cache_bow_output, index=False, float_format="%.6f")
        word2vec_data_model.to_csv(cache_word2vec_output, index=False, float_format="%.6f")

    with open('sequences', 'wb') as fp:
        pickle.dump(padded_sequences, fp)
    with open('sentiments', 'wb') as fp:
        pickle.dump(sentiments, fp)

    return data.data_model, word2vec_data_model
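# The Tokenizer and pad_sequences calls above come from the Keras text
# preprocessing utilities; their imports are not shown in the snippet. A
# minimal self-contained sketch, assuming a TensorFlow/Keras installation:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

texts = ["bitcoin is up today", "bitcoin is down"]

tokenizer = Tokenizer(num_words=20000)  # keep the 20k most frequent words
tokenizer.fit_on_texts(texts)           # builds the word -> index vocabulary
sequences = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(sequences, maxlen=200)  # left-pads with zeros by default

print(sequences)     # e.g. [[1, 2, 3, 4], [1, 2, 5]]
print(padded.shape)  # (2, 200)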
def preprocess(data_path, is_testing, min_occurrences=5, cache_bow_output=None,
               cache_word2vec_output=None, duration=None, sentiment_method=None):
    if duration and cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    elif cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path, is_testing, cache_bow_output=cache_bow_output,
                        cache_word2vec_output=cache_word2vec_output)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if not os.path.isfile("data/Train_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()
    if not os.path.isfile("data/Test_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data, sentiment_method=sentiment_method)
    data.sentiment_analysis_by_text()
    print("First five rows with sentiment: ", data.processed_data.head())

    if is_testing:
        data.processed_data.to_csv("data/one_month_clean_test_data_with_prices.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/one_month_clean_data_with_prices.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)

    if os.path.isfile(cache_word2vec_output):
        print("cache_word2vec_output file name: ", cache_word2vec_output)
        word2vec_data_model = pd.read_csv(cache_word2vec_output)
        data.data_model = pd.read_csv(cache_bow_output)
        print("data model head: ", data.data_model.head(5))
    else:
        data = DataTokenize(data)
        data.tokenize()
        data.stem()

        data = WordList(data)
        data.build_wordlist(min_occurrences=min_occurrences)
        word2vec_data = data

        data = BagOfWords(data.processed_data, data.wordlist, is_testing)
        data.build_data_model()
        print("data model head: ", data.data_model.head(5))

        """
        Word 2 vec
        """
        word2vec = Word2VecProvider()
        # REPLACE PATH TO THE FILE
        word2vec.load("data/glove.twitter.27B.200d-with2num.txt")

        word2vec_data = TwitterData(word2vec_data)
        word2vec_data.build_final_model(word2vec)
        word2vec_data_model = word2vec_data.data_model
        if "original_id" in word2vec_data_model.columns:
            word2vec_data_model.drop("original_id", axis=1, inplace=True)
        word2vec_data_model.dropna(axis=0, inplace=True)
        word2vec_data_model.reset_index(inplace=True, drop=True)
        word2vec_data_model.index = word2vec_data_model['timestamp']

    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    # if not is_testing:
    #     data = Plotting(data)
    #     data.plot()

    if not is_testing:
        if not os.path.isfile("train_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()
            for text in data.processed_data["text"]:
                texts.append(text)
            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)
            print("texts: ", texts[0:5])

            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences, maxlen=20, padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_train_data = pd.concat([
                padded_sequences,
                data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume", "weightedAverage"
                ]]
            ], axis=1)
            train_targets = data.processed_data[["close"]]
            print("shape of merged train data: ", merged_train_data.shape)

            with open('data/train_sequences', 'wb') as fp:
                pickle.dump(merged_train_data, fp)
            with open('data/train_prices', 'wb') as fp:
                pickle.dump(train_targets, fp)

            # load the whole embedding into memory
            embeddings_index = dict()
            with open("data/glove.twitter.27B.200d-with2num.txt", "r", encoding="utf-8") as my_file:
                for line in my_file:
                    values = line.split()
                    word = values[0]
                    coefs = numpy.asarray(values[1:], dtype='float32')
                    embeddings_index[word] = coefs
            print("*" * 80, "\n" * 10)
            print('Loaded %s train word vectors.' % len(embeddings_index))
            print('Total %s of word indexes.' % len(tokenizer.word_index))

            with open('data/embeddings_index', 'wb') as fp:
                pickle.dump(embeddings_index, fp)
            with open('data/train_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)

            # encode class values as integers
            # encoder = LabelEncoder()
            # encoder.fit(sentiments)
            # encoded_sentiments = encoder.transform(sentiments)
            # convert integers to dummy variables (i.e. one hot encoded)
            # dummy_sentiments = np_utils.to_categorical(encoded_sentiments)

            # for text in data.processed_data.loc[data.processed_data['sentiment'] != 0, "text"]:
            #     texts.append(text)
            #
            # for sentiment in data.processed_data.loc[data.processed_data['sentiment'] != 0, "sentiment"]:
            #     sentiments.append(sentiment)
    else:
        if not os.path.isfile("test_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()
            for text in data.processed_data["text"]:
                texts.append(text)
            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)
            print("texts: ", texts[0:5])

            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences, maxlen=20, padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_test_data = pd.concat([
                padded_sequences,
                data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume", "weightedAverage"
                ]]
            ], axis=1)
            test_targets = data.processed_data[["close"]]
            print("shape of merged test data: ", merged_test_data.shape)

            with open('data/test_sequences', 'wb') as fp:
                pickle.dump(merged_test_data, fp)
            with open('data/test_prices', 'wb') as fp:
                pickle.dump(test_targets, fp)
            with open('data/test_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)

            print("\n\n##################################################\npadded sequence head: \n",
                  padded_sequences[0:5])
            print("\n####################################################\n padded sequence length \n",
                  len(padded_sequences))

    # NOTE: train_data_word2vec_file_name and test_data_word2vec_file_name are
    # not defined in this function; they are assumed to exist at module level.
    if not os.path.isfile(train_data_word2vec_file_name) or not os.path.isfile(
            test_data_word2vec_file_name):
        if cache_bow_output is not None:
            data.data_model.to_csv(cache_bow_output, index=False, float_format="%.6f")
            word2vec_data_model.to_csv(cache_word2vec_output, index=False, float_format="%.6f")

    return data.data_model, word2vec_data_model
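# The pickled embeddings_index and word_index above are typically combined
# into an embedding matrix for an Embedding layer later; that step is not part
# of this snippet, so the construction below is a hedged sketch (numpy only,
# names mirror the variables above):
import numpy

def build_embedding_matrix(word_index, embeddings_index, dim=200):
    # Row i holds the vector for the word with index i; unknown words stay zero
    matrix = numpy.zeros((len(word_index) + 1, dim))
    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            matrix[i] = vector
    return matrix

# Usage with the artifacts pickled above:
# with open('data/embeddings_index', 'rb') as fp: embeddings_index = pickle.load(fp)
# with open('data/train_word_indexes', 'rb') as fp: word_index = pickle.load(fp)
# embedding_matrix = build_embedding_matrix(word_index, embeddings_index)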