def compute_p_word_given_class(data_paths, vocab_size):
    """
    Return a dictionary of word probabilities, P(word | class).
    All data paths belong to the same class.
    Incorporate Laplacian Smoothing with k=1 here.
    p_word_given_class should include the probability of UNKNOWN_WORD,
    any word that doesn't appear in the training set.
    """
    p_word_given_class = dict()

    # compute number of words in the given class
    class_size = 0
    for path in data_paths:
        message = open_file(path)
        words = get_words(message)
        class_size += len(words)

    # add elements to dictionary
    for path in data_paths:
        message = open_file(path)
        words = get_words(message)
        for word in words:
            if word in p_word_given_class:
                p_word_given_class[word] += 1 / (class_size + vocab_size + 1)
            else:
                p_word_given_class[word] = 2 / (class_size + vocab_size + 1)

    p_word_given_class['UNKNOWN_WORD'] = 1 / (class_size + vocab_size + 1)
    return p_word_given_class
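# A minimal usage sketch for compute_p_word_given_class above; it is not part of the
# original snippet. The file paths and vocab_size below are hypothetical, and it assumes
# the same open_file/get_words helpers the function relies on. With k=1 smoothing, a word
# seen c times among N class tokens gets probability (c + 1) / (N + vocab_size + 1), and
# any unseen word falls back to the UNKNOWN_WORD mass 1 / (N + vocab_size + 1).
spam_paths = ['data/spam/0001.txt', 'data/spam/0002.txt']  # hypothetical training files
p_word_given_spam = compute_p_word_given_class(spam_paths, vocab_size=5000)
p = p_word_given_spam.get('prize', p_word_given_spam['UNKNOWN_WORD'])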
def __get_equals_words_atr(self, summary_sentence, document_sentence):
    ''' Return the proportion of words shared by both sentences '''
    words_summary = utils.get_words(summary_sentence, stop_words=True)
    words_document = utils.get_words(document_sentence, stop_words=True)
    total_words = len(words_summary)
    equals_words = 0
    tmp_dict = dict()
    for word_summary in words_summary:
        for word_document in words_document:
            if word_summary == word_document and word_summary not in tmp_dict:
                tmp_dict[word_summary] = 1
                equals_words += 1
    return equals_words / total_words
def analyze_text(self, text):
    words = get_words(text)
    char_count = get_char_count(words)
    word_count = len(words)
    sentence_count = len(get_sentences(text))
    syllable_count = count_syllables(words)
    complexwords_count = count_complex_words(text)
    avg_words_p_sentence = word_count / sentence_count

    self.analyzedVars = {
        'words': words,
        'char_cnt': float(char_count),
        'word_cnt': float(word_count),
        'sentence_cnt': float(sentence_count),
        'syllable_cnt': float(syllable_count),
        'complex_word_cnt': float(complexwords_count),
        'avg_words_p_sentence': float(avg_words_p_sentence)
    }

    outData = {
        'char_cnt': float(char_count),
        'word_cnt': float(word_count),
        'sentence_cnt': float(sentence_count),
        'syllable_cnt': float(syllable_count),
        'complex_word_cnt': float(complexwords_count),
        'avg_words_p_sentence': float(avg_words_p_sentence)
    }

    return outData
def compute(test_id='test'):
    pic_path = 'static/' + test_id + '.png'
    truth_path = 'static/' + test_id + '.txt'
    base_image, pic_with_box, word_pics = get_words(pic_path)
    word_pics = np.stack(word_pics)
    pred = F.softmax(net_recog(torch.from_numpy(word_pics)), dim=1)
    _, idxes = torch.max(pred, dim=1)
    res = []
    for idx in idxes:
        res.append(ug_dict[idx])
    f = open(truth_path, 'r')
    lines = f.readlines()
    f.close()
    lines = [l.strip() for l in lines]
    truth = []
    for l in lines:
        truth += l.split()
    for i, w in enumerate(res):
        if w not in truth:
            res[i] = '<span style="color:red">' + res[i] + '</span>'
    truth = ' '.join(truth)
    res = ' '.join(res[::-1])
    score = difflib.SequenceMatcher(a=truth, b=res).ratio()
    return img2str(base_image), img2str(pic_with_box), res, truth, score
def p_word_given_class(self, data_files, vocab_size):
    """
    helper function
    return dictionary representation of P(word | class)
    """
    word_counter = Counter()
    for file in data_files:
        words = get_words(open_file(file))
        for i, word in enumerate(words):
            if unigrams:
                if word not in self.vocab:
                    continue
                word_counter[word] += 1
            if bigrams:
                if i != 0:
                    bigram = words[i - 1] + ' ' + word
                    if bigram not in self.vocab:
                        continue
                    word_counter[bigram] += 1
    word_counter["UNK"] = 0
    total_count = sum(word_counter.values())
    for word in word_counter:
        word_counter[word] = (word_counter[word] + self.k) / (total_count + vocab_size * self.k)
    p_word_given_class = dict(word_counter)
    return p_word_given_class
def construct_libsvm_line(line):
    global target_index, value_indexes, headers, indexes2binarize, indexes2tokenize, indexes2filter

    label = target_index
    new_line = []

    for i in sorted(indexes2binarize + indexes2tokenize):
        col_name = headers[i]
        if i in indexes2binarize:
            value = line[i]
            try:
                value_index = value_indexes[i][value]
            except KeyError:
                continue
            new_item = "%s:1" % (value_index)
            new_line.append(new_item)
        else:
            text = line[i]
            words = get_words(text)
            # word_indexes = map( lambda x: value_indexes[i][x], words )
            word_indexes = get_word_indexes(words, i)
            for word_index in sorted(word_indexes):
                new_item = "%s:1" % (word_index)
                new_line.append(new_item)

    new_line.insert(0, label)
    new_line = " ".join(new_line)
    return new_line
def p_class_given_input(self, input_path, p_word_given_class, p_class):
    """
    helper function
    return P(class | input)
    """
    p_class_given_input = 0
    words = get_words(open_file(input_path))

    if bigrams:
        sample_bigrams = set()
        for i, word in enumerate(words):
            if i != 0:
                sample_bigrams.add(words[i - 1] + ' ' + word)
        for bigram in sample_bigrams:
            if bigram not in self.vocab:
                continue
            if bigram not in p_word_given_class:
                bigram = "UNK"
            p = p_word_given_class[bigram]
            if p > 0:
                p_class_given_input += math.log(p_word_given_class[bigram])

    if unigrams:
        for word in set(words):
            if word not in self.vocab:
                continue
            if word not in p_word_given_class:
                word = "UNK"
            p = p_word_given_class[word]
            if p > 0:
                p_class_given_input += math.log(p_word_given_class[word])

    p_class_given_input += math.log(p_class)
    return p_class_given_input
def compute_p_word_given_class(data_paths, vocab_size): """ Return a dictionary of word probabilities, P(word | class). All datapaths belong to the same class. Incorporate Laplacian Smoothing with k=1 here. p_word_given_class should include the probability of UNKNOWN_WORD, any word that doesn't appear in the training set """ ########################################################### # Implement your solution here ########################################################### count_words = {} total_num_words = 0 for path in data_paths: words = get_words(open_file(path)) total_num_words += len(words) for word in words: if word in count_words: count_words[word] += 1 else: count_words[word] = 1 p_word_given_class = {} for word in count_words: prob = (count_words[word] + 1) / (total_num_words + vocab_size + 1) p_word_given_class[word] = prob #print(word + " ", end="") #print(p_word_given_class[word]) p_word_given_class["UNKNOWN_WORD"] = 1 / (total_num_words + vocab_size + 1) #print(p_word_given_class["UNKNOWN_WORD"]) return p_word_given_class
def get_cut_points(
    time_dict: Dict[datetime.datetime, List[str]],
    up_ratio: float = 2,
    down_ratio: float = 0.75,
    topK: int = 5
) -> List[Tuple[datetime.datetime, datetime.datetime, List[str]]]:
    status = 0
    cut_points = []
    prev_num = None
    start_time = None
    temp_texts = []
    for time, texts in time_dict.items():
        if prev_num is None:
            start_time = time
            temp_texts = copy.copy(texts)
        elif status == 0 and len(texts) >= prev_num * up_ratio:
            status = 1
            temp_texts.extend(texts)
        elif status == 1 and len(texts) < prev_num * down_ratio:
            tags = utils.get_words("。".join(texts), topK=topK)
            cut_points.append((start_time, time, tags))
            status = 0
            start_time = time
            temp_texts = copy.copy(texts)
        elif status == 0:
            start_time = time
            temp_texts = copy.copy(texts)
        prev_num = len(texts)
    return cut_points
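# A hedged usage sketch for get_cut_points, not from the original project. It assumes
# utils.get_words(text, topK=n) returns the top-n keywords of the joined text (that
# signature is taken from the call above). A burst opens once the per-bucket message
# count grows by up_ratio and closes once it drops below down_ratio of the previous bucket.
import datetime

base = datetime.datetime(2024, 1, 1, 12, 0)
counts = [2, 2, 6, 7, 3, 2]  # hypothetical message counts per minute
time_dict = {
    base + datetime.timedelta(minutes=m): ['msg'] * n for m, n in enumerate(counts)
}
for start, end, tags in get_cut_points(time_dict, up_ratio=2, down_ratio=0.75, topK=5):
    print(start, end, tags)  # one burst, spanning minute 1 through minute 4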
def analyze_text(self, text):
    words = get_words(text)
    char_count = int(get_char_count(words))
    word_count = int(len(words))
    sentences = get_sentences(text)
    len_sentences = len(sentences)
    sentence_count = int(len_sentences)
    # sentence_count = int(len(get_sentences(text)))
    syllable_count = count_syllables(words)
    complexwords_count = count_complex_words(text.decode('utf-8'))
    avg_words_p_sentence = word_count / sentence_count
    encoding_dict = detect_encoding(self.filename)

    self.analyzedVars = {
        'filename': self.filename,
        # 'text_truncated': text[:200].replace("\n", " "),
        'words': words,
        'char_cnt': float(char_count),
        'word_cnt': float(word_count),
        'sentence_cnt': float(sentence_count),
        'syllable_cnt': float(syllable_count),
        'complex_word_cnt': float(complexwords_count),
        'avg_words_p_sentence': float(avg_words_p_sentence),
        'encoding': encoding_dict['encoding'],
        'encoding_confidence': encoding_dict['confidence']
    }
def SPIMI(docs, details=False):
    ans = {}
    cur = 0
    tmp = []
    for doc in docs:
        cnt = 0
        with open(doc, 'r') as f:
            s = f.read()
        stop_list = gen_default_stop(s)
        words = get_words(s, stop_list)
        indexing = invert_index(words, delta=False)
        for key, value in indexing.items():
            if key not in ans:
                ans[key] = list(np.array(value) + cur)
            else:
                ans[key] += list(np.array(value) + cur)
            cnt += len(value)
        tmp.append(cnt)
        cur += cnt
    # gap-encode each postings list: store differences between consecutive positions
    # (iterate from the end so every gap is computed from the original values)
    for key, value in ans.items():
        for i in range(len(value) - 1, 0, -1):
            value[i] -= value[i - 1]
    if details:
        # keys below: number of terms, number of documents, number of tokens,
        # per-document lengths, average document length
        res = {'词项数目': len(ans), '文档数量': len(docs),
               '词条数量': cur, '文档长度': tmp, '文档平均长度': np.average(tmp)}
        return ans, res
    return ans
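# A small decoding sketch, not from the original code: with the gap encoding above, each
# postings list stores differences between consecutive token positions, so a running sum
# restores the absolute positions (e.g. gaps [3, 2, 4] decode back to [3, 5, 9]).
def decode_gaps(gaps):
    positions, total = [], 0
    for g in gaps:
        total += g
        positions.append(total)
    return positions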
def __get_words_summary(self, summary_text):
    ''' Gets unique words of the summary '''
    self.__words_summary = utils.get_words(summary_text)
    for word in self.__words_summary:
        if word in self.__unique_words_summary:
            self.__unique_words_summary[word] += 1
        else:
            self.__unique_words_summary[word] = 1
def __init__(self, fold):
    self.fold = fold
    #print('fold {}'.format(fold))

    #print('getting data paths')
    self.X_train_pos, self.X_test_pos = get_data_paths(
        data_positive, test_data_start, numfolds, self.fold)
    self.X_train_neg, self.X_test_neg = get_data_paths(
        data_negative, test_data_start, numfolds, self.fold)

    #print('building vocabulary')
    self.vocab = Counter()

    # filtering words under threshold
    words_to_delete = set()
    ft = 0

    if bigrams:
        # print('generating vocabulary for bigrams')
        for path in self.X_train_pos + self.X_train_neg:
            message = open_file(path)
            words = get_words(message)
            for i, word in enumerate(words):
                if i != 0:
                    bigram = words[i - 1] + ' ' + word
                    self.vocab[bigram] += 1
        ft = frequency_cutoff_bigram
        for word in self.vocab:
            if self.vocab[word] < ft:
                words_to_delete.add(word)

    if unigrams:
        # print('generating vocabulary for unigrams')
        for path in self.X_train_pos + self.X_train_neg:
            message = open_file(path)
            words = get_words(message)
            for word in words:
                self.vocab[word] += 1
        ft = frequency_cutoff_unigram
        for word in self.vocab:
            if self.vocab[word] < ft:
                words_to_delete.add(word)

    for word in words_to_delete:
        del self.vocab[word]

    #print('initializing rest of variables')
    self.vocab_size = len(set(self.vocab))
    self.p_word_given_pos = dict()
    self.p_word_given_neg = dict()
    self.p_pos = 0
    self.p_neg = 0
    self.k = k
def __get_keywords_positions(self, text_document):
    word_list = utils.get_words(text_document)
    position_list = dict()
    for i in range(len(word_list)):
        if word_list[i] in self.__keywords_list:
            if not word_list[i] in position_list:
                position_list[word_list[i]] = list()
            position_list[word_list[i]].append(i)
    return position_list
def relable_transformer(val, keywords, no_hit_to_null=True):
    val_clean = ' '.join(get_words(val))
    for keyword, repl in keywords:
        if keyword in val_clean:
            return repl
    if no_hit_to_null:
        return na_value
    else:
        return val
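# A hypothetical usage of relable_transformer above, not from the original project.
# It assumes get_words keeps 'gmail' as a token of the cleaned value, and that na_value
# is a module-level placeholder returned when no keyword matches (both assumptions).
keywords = [('google', 'search_engine'), ('gmail', 'email')]
label = relable_transformer('checked gmail inbox twice', keywords)  # -> 'email'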
def __frequency_word(self, text_document):
    word_list = utils.get_words(text_document)
    frequency_word = dict()
    for word in word_list:
        if word in frequency_word:
            frequency_word[word] += 1
        else:
            frequency_word[word] = 1
    return frequency_word
def create_position_list(self):
    '''
    Obtains the positions in the document of the summary words, encoded as 'S1_W1':
    S1 = position of the sentence, W1 = position of the word in the sentence
    '''
    for i in range(len(self.__document_sentence_list)):
        words = utils.get_words(self.__document_sentence_list[i])
        for j in range(len(words)):
            if words[j] in self.__unique_words_summary:
                if not words[j] in self.__position_word_list:
                    self.__position_word_list[words[j]] = list()
                self.__position_word_list[words[j]].append(str(i + 1) + '_' + str(j + 1))
def prepare_chn_data(args):
    settings = parse_settings(args.setting)
    try:
        chn_file = os.path.join(ENV.data_dir, "%s" % settings['chn'])
        combinations = os.path.join(ENV.data_dir, "combination_%s" % settings['chn'])
        words = utils.get_words(chn_file)
        for i in range(2, 3):
            for expression in list(itertools.combinations(words, i)):
                utils.push_word_back(combinations, ''.join(expression))
    except Exception as e:
        print e
def __get_sense_units_atr(self, summary_sentence, document_sentence):
    ''' Return the number of summary words that have a synonym in the document sentence '''
    words_summary = utils.get_words(summary_sentence, stop_words=True)
    words_document = utils.get_words(document_sentence, stop_words=True)
    #print("words_summary", words_summary)
    #print("words_document", words_document)
    unique_words = dict()
    cont = 0
    tmp = list()
    for word_summary in words_summary:
        if not word_summary in unique_words:  # to avoid repetitions
            unique_words[word_summary] = 1
            for word_document in words_document:
                if word_summary in self.__tep_synonyms and word_document in self.__tep_synonyms[word_summary]:
                    #print("synonyms", word_summary, word_document)
                    tmp.append((word_summary, word_document))
                    cont += 1
    return cont  # (cont, tmp)
def build_markov(self, file_name):
    previous = None
    self.word_list = utils.get_words(file_name)
    for word in self.word_list:
        if previous is None:
            previous = word
            continue
        if previous not in self:
            self[previous] = Dictogram()
        self[previous].add_count(word)
        previous = word
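# A hedged sketch of walking the chain that build_markov constructs; it is not part of
# the original class. It assumes the chain object and Dictogram are both dict-like
# (the method above indexes self[previous] and counts followers), so follower words can
# be sampled in proportion to their observed counts.
import random

def random_walk(markov, start, length=10):
    words, current = [start], start
    for _ in range(length - 1):
        followers = markov.get(current)
        if not followers:
            break
        current = random.choices(list(followers.keys()),
                                 weights=list(followers.values()))[0]
        words.append(current)
    return ' '.join(words)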
def count_file(filename):
    freqs = Counter()
    with open(filename, 'r') as file:
        chunk_reader = partial(_read_chunk_full_words, file)
        for chunk in iter(chunk_reader, ''):
            words = get_words(chunk)
            freqs.update(Counter(words))
            update_data(freqs)
    return frequencies_string(freqs)
def start_classify(path, classifier):
    output_folder = prepare_output()
    words = utils.get_words(path)
    for word in words:
        #word = post_processing
        if classifier.classify(word) == 'words':
            utils.push_word_back(os.path.join(output_folder, "words.txt"), word)
        elif classifier.classify(word) == 'pinyin':
            utils.push_word_back(os.path.join(output_folder, "pinyin.txt"), word)
        else:
            pass
def get_bag_of_symbols(max_len, string):
    words = get_words(string)
    bag = []
    window_size = 1
    while window_size <= max_len:
        for s_i in range(0, len(words) - window_size + 1):
            symbol = []
            for win_i in range(s_i, s_i + window_size):
                symbol.append(words[win_i])
            bag.append(symbol)
        window_size = window_size + 1
    return bag
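# A quick illustration of the sliding windows get_bag_of_symbols produces (hypothetical
# input, and it assumes get_words simply tokenizes on whitespace here). For max_len=2 and
# the string 'a b c' the bag would be:
#   [['a'], ['b'], ['c'], ['a', 'b'], ['b', 'c']]
bag = get_bag_of_symbols(2, 'a b c')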
def __init__(self, board=None):
    if board is None:
        words = get_words()
        sample = random.sample(words, 25)
        labels = Board.get_label_list()
        self.board = [
            Word(word, label) for (word, label) in zip(sample, labels)
        ]
    else:
        self.board = board
    self.starting_team = Tags.BLUE if len(self.get_words(Tags.BLUE)) > len(
        self.get_words(Tags.RED)) else Tags.RED
def frequency_analysis():
    global result
    words = utils.get_words(search_string)
    frequencies = utils.get_frequencies()
    input_frequencies = utils.get_frequencies(words, "input")
    input_first_letter = input_frequencies[0]
    for i in frequencies:
        delta = ord(input_first_letter[0]) - ord(i[0])
        check_with_shift(delta)
        if result.percentage == 100:
            break
def do_preprocess(self, url_list, label_list):
    """
    Run preprocessing.
    :param url_list:
    :param label_list:
    :return:
    """
    if MIN_WORD_FREQ > 0:
        x__, word_reverse_dict = get_word_vocab(url_list, MAX_LENGTH_WORDS, MIN_WORD_FREQ)
        self.high_freq_words = sorted(list(word_reverse_dict.values()))

    self.x, self.word_reverse_dict = get_word_vocab(url_list, MAX_LENGTH_WORDS)
    word_x = get_words(self.x, self.word_reverse_dict, DELIMIT_MODE, url_list)
    self.ngramed_id_x, self.ngrams_dict, self.worded_id_x, self.words_dict = \
        ngram_id_x(word_x, MAX_LENGTH_SUBWORDS, self.high_freq_words)
    self.chars_dict = self.ngrams_dict
    self.chared_id_x = get_char_id_x(url_list, self.chars_dict, MAX_LENGTH_CHARS)

    pos_x, neg_x = list(), list()
    for index in range(len(label_list)):
        label = label_list[index]
        if label == 1:
            pos_x.append(index)
        else:
            neg_x.append(index)
    print("Overall Mal/Ben split: {}/{}".format(len(pos_x), len(neg_x)))
    pos_x = np.array(pos_x)
    neg_x = np.array(neg_x)

    self.x_train, self.y_train, self.x_test, self.y_test = prep_train_test(
        pos_x, neg_x, DEV_PERCENTAGE)
    self.x_train_char = get_ngramed_id_x(self.x_train, self.ngramed_id_x)
    self.x_test_char = get_ngramed_id_x(self.x_test, self.ngramed_id_x)
    self.x_train_word = get_ngramed_id_x(self.x_train, self.worded_id_x)
    self.x_test_word = get_ngramed_id_x(self.x_test, self.worded_id_x)
    self.x_train_char_seq = get_ngramed_id_x(self.x_train, self.chared_id_x)
    self.x_test_char_seq = get_ngramed_id_x(self.x_test, self.chared_id_x)

    self.dump_dict(self.ngrams_dict, NGRAMS_DICT_FILE)
    self.dump_dict(self.words_dict, WORDS_DICT_FILE)
    self.dump_dict(self.chars_dict, CHARS_DICT_FILE)
def __read_documents(self, id_class, class_path):
    size = len(os.listdir(class_path))
    for name_document in os.listdir(class_path):
        text_document = utils.read_file(
            os.path.join(class_path, name_document))
        word_list = utils.get_words(text_document)
        for word in word_list:
            if word in self.__classes[id_class]:
                self.__classes[id_class][word] += 1
            else:
                self.__classes[id_class][word] = 1
    for key in self.__classes[id_class].keys():
        self.__classes[id_class][key] /= size
def __fill_class(self, id_class, name_document, text_document):
    word_list = utils.get_words(text_document)
    for word in word_list:
        tag = nltk.pos_tag([word])
        if (tag[0][1].startswith('N') or tag[0][1].startswith('V')
                or tag[0][1].startswith('S') or tag[0][1].startswith('F')
                or tag[0][1].startswith('J')):
            if word in self.__classes[id_class]:
                if name_document in self.__classes[id_class][word]:
                    self.__classes[id_class][word][name_document] += 1
                else:
                    self.__classes[id_class][word][name_document] = 1
            else:
                self.__classes[id_class][word] = dict()
                self.__classes[id_class][word][name_document] = 1
def compute_p_class_given_input(input_path, p_word_given_class, p_class): """ Return P(class | input). """ message = open_file(input_path) words = get_words(message) p_class_given_input = 0.0 for word in words: if (word in p_word_given_class): p_class_given_input += ln(p_word_given_class[word] * p_class) else: p_class_given_input += ln(p_word_given_class['UNKNOWN_WORD'] * p_class) return p_class_given_input
def do_preprocess(self, url_list):
    """
    Preprocess the test data.
    :param url_list:
    :return:
    """
    self.chars_dict = self.load_dict(CHARS_DICT_FILE)
    self.ngrams_dict = self.load_dict(NGRAMS_DICT_FILE)
    self.words_dict = self.load_dict(WORDS_DICT_FILE)
    x, word_reverse_dict = get_word_vocab(url_list, MAX_LENGTH_WORDS)
    word_x = get_words(x, word_reverse_dict, DELIMIT_MODE, url_list)
    self.ngramed_id_x, self.worded_id_x = \
        ngram_id_x_from_dict(word_x, MAX_LENGTH_SUBWORDS, self.ngrams_dict, self.words_dict)
    self.chared_id_x = get_char_id_x(url_list, self.chars_dict, MAX_LENGTH_CHARS)
def analyze_text(self, text):
    words = get_words(text)
    char_count = get_char_count(words)
    word_count = len(words)
    sentence_count = len(get_sentences(text))
    syllable_count = count_syllables(words)
    complexwords_count = count_complex_words(text)
    avg_words_p_sentence = word_count / sentence_count

    self.analyzedVars = {
        'words': words,
        'char_cnt': float(char_count),
        'word_cnt': float(word_count),
        'sentence_cnt': float(sentence_count),
        'syllable_cnt': float(syllable_count),
        'complex_word_cnt': float(complexwords_count),
        'avg_words_p_sentence': float(avg_words_p_sentence)
    }
def analyze_text(self, text):
    words = get_words(text)
    char_count = get_char_count(words)
    words_count = len(words)
    sentence_count = len(get_sentences(text))
    syllable_count = count_syllables(words)
    print("syllable_count:", syllable_count)
    complex_words_count = count_complex_words(text)
    avg_words_per_sentence = int(words_count / sentence_count)
    print("avg_words_per_sentence", avg_words_per_sentence)

    self.ana_vars = {
        'words': words,
        'char_count': float(char_count),
        'words_count': float(words_count),
        'sentence_count': float(sentence_count),
        'syllable_count': float(syllable_count),
        'complex_words_count': float(complex_words_count),
        'avg_words_per_sentence': float(avg_words_per_sentence)
    }
def analyze_text(self, text):
    words = get_words(text)
    char_count = int(get_char_count(words))
    word_count = int(len(words))
    sentence_count = int(len(get_sentences(text)))
    syllable_count = count_syllables(words)
    complexwords_count = count_complex_words(text)
    avg_words_p_sentence = word_count / sentence_count

    self.analyzedVars = {
        'filename': self.filename,
        # 'text_truncated': text[:200].replace("\n", " "),
        'words': words,
        'char_cnt': float(char_count),
        'word_cnt': float(word_count),
        'sentence_cnt': float(sentence_count),
        'syllable_cnt': float(syllable_count),
        'complex_word_cnt': float(complexwords_count),
        'avg_words_p_sentence': float(avg_words_p_sentence)
    }
def compute_p_class_given_input(input_path, p_word_given_class, p_class): """ Return P(class | input). """ ########################################################### # Implement your solution here ########################################################### words = get_words(open_file(input_path)) p_class_given_input = 0 for word in words: prob = p_word_given_class["UNKNOWN_WORD"] if word in p_word_given_class: prob = p_word_given_class[word] p_class_given_input += math.log(prob) p_class_given_input += math.log(p_class) return p_class_given_input
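# A minimal end-to-end sketch tying the two assignment helpers together (not part of the
# handout). The paths, priors, and vocab_size are hypothetical placeholders; it assumes
# compute_p_word_given_class from the earlier snippet and the usual open_file/get_words
# helpers are available in the same module.
spam_paths = ['data/train/spam/0001.txt']   # hypothetical
ham_paths = ['data/train/ham/0001.txt']     # hypothetical
vocab_size = 5000                           # hypothetical
p_word_given_spam = compute_p_word_given_class(spam_paths, vocab_size)
p_word_given_ham = compute_p_word_given_class(ham_paths, vocab_size)
log_p_spam = compute_p_class_given_input('data/test/0001.txt', p_word_given_spam, 0.5)
log_p_ham = compute_p_class_given_input('data/test/0001.txt', p_word_given_ham, 0.5)
prediction = 'spam' if log_p_spam > log_p_ham else 'ham'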
def create_corpus(corpus_path):
    if os.path.exists(corpus_path):
        print(f'Using corpus {corpus_path}')
    else:
        print(f'Creating corpus {corpus_path}')
        word_list = [w.lower() for w in get_words(prefix='../')]
        replacements = {}
        for word in word_list:
            if ' ' in word:
                replacements[word] = word.replace(' ', '_')

        corpus_files = []
        for path, dirs, files in os.walk(CORPUS_FILE_PATH):
            corpus_files = [f for f in files if f.endswith('.gz')]
            break
        random.shuffle(corpus_files)

        sentences = []
        for corpus_file in tqdm(corpus_files):
            with gzip.open(f'{CORPUS_FILE_PATH}/{corpus_file}', 'rb') as f_in:
                for line in f_in:
                    line = line.decode('utf-8').rstrip()
                    for r in replacements:
                        line = re.sub(rf'\b{r}\b', replacements[r], line)
                    sentences.append(line)

        sentence_order = list(range(len(sentences)))
        random.shuffle(sentence_order)

        print('writing corpus to file...')
        f_out = gzip.open(corpus_path, 'wb')
        for i in sentence_order:
            f_out.write((sentences[i] + '\n').encode('utf-8'))
        f_out.close()
        print('done writing corpus to file')
def flesch_kincaid_score(article):
    xml_url = '&titles='.join([xml_api_url, title])
    try:
        xml = requests.get(xml_url).content
        bs = BeautifulSoup(xml)
        try:
            # convert NavigableString to string after encoding
            text = str(bs.find('extract').contents[0].encode('utf-8'))
            non_text = ['== See also ==\n', '== References ==\n',
                        ' === Further references ===\n',
                        '== External links ==\n', '== Notes ==\n']
            for ele in non_text:
                text = text.split(ele, 1)[0]
            text = re.sub('==.*==', '', text)
            words = get_words(text)
            syllableCount = count_syllables(text)
            sentences = get_sentences(text)
            # Flesch reading-ease formula: 206.835 - 1.015*(words/sentence) - 84.6*(syllables/word)
            fk = 206.835 - 1.015 * len(words) / len(sentences) - 84.6 * (syllableCount) / len(words)
            return float(format(fk, '.2f'))
        except:
            print 'Error while computing fk score of ' + article
            print format_exc()
    except:
        print 'Error while fetching xml content of ' + article
        print format_exc()
def main():
    words = get_words()[:100]
    item = words[0]
    hashtable = HashTable(words, hashsize=10)
    print hashtable
    print hashtable.search(item)
import argparse

import graph
import codecs
import utils
import rhymes
import generator


def get_args():
    parser = argparse.ArgumentParser(description="Generate a nice poem :)")
    parser.add_argument('--source_text', default='data/PanTadeusz.txt')
    parser.add_argument('--syllable_count', type=int, default=13)
    parser.add_argument('--rhyme_span', type=int, default=2)
    parser.add_argument('--length', type=int, default=4)
    parser.add_argument('--markov_order', type=int, default=1)
    # parser.add_argument('--rhyme_pattern')
    # parser.add_argument('--keyword_file')
    return parser.parse_args()


if __name__ == '__main__':
    args = get_args()
    words = []
    with codecs.open(args.source_text, 'rb', encoding='utf8') as f:
        for l in f:
            words += utils.get_words(l.rstrip())
    wg = graph.from_file(words, args.markov_order)
    rhs = rhymes.from_file(words, args.rhyme_span)
    poem = generator.create_poem(wg, rhs, args.syllable_count, args.length)
unique_values = defaultdict(set)

n = 0
for line in reader:
    for i in indexes2binarize:
        value = line[i]
        if pass_filter(value, headers[i]):
            unique_values[i].add(value)

    # the same, but first get unique words from the column text
    # could also use non-unique words, as unique_values[i] is a set
    # using approach no2
    for i in indexes2tokenize:
        text = line[i]
        words = get_words(text)

        # filter
        words = filter_words(words, headers[i])

        for w in words:
            unique_values[i].add(w)

    n += 1
    if n % 10000 == 0:
        print n

for i in unique_values:
    print "%s: %s" % (i, len(unique_values[i]))

# calculate column offsets