def vectorize_dataset(dataset, word2idx, memory_size, sentence_length):
    def word2idx_func(x):
        return word2idx.get(x, 0)

    def pad_2d_to(width, array):
        d1, d2 = abs(width[0] - array.shape[0]), abs(width[1] - array.shape[1])
        return np.pad(array, ((0, d1), (0, d2)), 'constant')

    def pad_1d_to(width, array):
        d = abs(width - array.shape[0])
        return np.pad(array, ((0, d)), 'constant')

    N = len(dataset)
    facts = np.zeros((N, memory_size, sentence_length))
    query = np.zeros((N, sentence_length))
    answer = np.zeros((N))
    for idx, (fcts, q, a) in enumerate(dataset):
        facts[idx] = pad_2d_to(
            [memory_size, sentence_length],
            np.vstack([
                pad_1d_to(
                    sentence_length,
                    np.fromiter(map(word2idx_func, tokenize(f)), np.int32))
                for f in fcts
            ])[-memory_size:])
        query[idx] = pad_1d_to(
            sentence_length,
            np.fromiter(map(word2idx_func, tokenize(q)), np.int32))
        answer[idx] = word2idx_func(a)
    return facts, query, answer
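# Hedged usage sketch (not part of the original source): assumes `tokenize`
# splits on whitespace/punctuation and that unknown words map to index 0.
# The toy story and word2idx below are made up for illustration only.
#
#   dataset = [(["john went home", "mary is here"], "where is john", "home")]
#   word2idx = {"john": 1, "went": 2, "home": 3, "mary": 4,
#               "is": 5, "here": 6, "where": 7}
#   facts, query, answer = vectorize_dataset(
#       dataset, word2idx, memory_size=5, sentence_length=6)
#   # facts.shape == (1, 5, 6), query.shape == (1, 6), answer.shape == (1,)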
def get_norm_words(string):
    if stem:
        return [
            stemmer.stem(word)
            for word in tokenizer.tokenize(string.lower())
            if word not in stop_words
        ]
    else:
        return tokenizer.tokenize(string)
def main(ignore_len=3, report_crs=False):
    for line in fileinput.input():
        src, trg, crs = diff2before_after(line.strip(), report_crs)
        before = tokenize(src)
        after = tokenize(trg)
        if len(before) > ignore_len and len(after) > ignore_len:
            print(src, file=sys.stderr)
            print(trg)
    if report_crs:
        with open("crs.txt", "w", encoding="utf-8") as outfile:
            outfile.write(str(crs))
def get_token_splitter(self, type="unigram"):
    """
    Returns a "tokenisation" function, but potentially also for bigrams,
    or for both unigrams and bigrams.
    """
    if type == "unigram":
        return lambda s: tokenize(s)
    elif type == "bigram":
        return bigram_splitter
    elif type == "both":
        # concatenation of both unigrams and bigrams
        return lambda s: tokenize(s) + bigram_splitter(s)
    else:
        return lambda s: []
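# Hedged example (assumes `tokenize` is a word-level tokenizer and
# `bigram_splitter` is the helper defined elsewhere in this collection;
# `obj` stands in for an instance of the enclosing class):
#
#   split = obj.get_token_splitter(type="both")
#   split("new york city")
#   # -> ['new', 'york', 'city', 'new york', 'york city']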
def cleanSW(inputSen):
    tokens = tokenize(inputSen)
    # drop words with not much meaning in the sentence, like 'a' or 'is'
    clean_tokens = [x for x in tokens if x not in stopwords.words()]
    return clean_tokens
def create_vocabulary(directory):
    tokens = [
        token
        for f in os.listdir(directory)
        for token in tokenize(open(os.path.join(directory, f)).read())
        if not token.isdigit()
    ]
    return {v: k for k, v in enumerate(set(tokens), start=1)}
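# Hedged example (assumes a directory of plain-text files and a word-level
# `tokenize`): for files containing "hello world" and "hello again", the
# result is a token -> index mapping such as {'hello': 1, 'world': 2,
# 'again': 3}. The exact indices depend on set iteration order; index 0 is
# left free, e.g. for padding or unknown tokens.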
def main():
    NOVELS_DIR = 'Limpios'
    novelas = os.listdir(pathlib.Path(NOVELS_DIR))
    stemmer = SnowballStemmer("english")
    lemmer = WordNetLemmatizer()
    sw = stopwords.words('english')
    corpus = open('corpus_total.txt', 'w', encoding='utf8')
    improcesables = []
    for novela in novelas:
        print('processing {}'.format(novela))
        novela_path = pathlib.Path(NOVELS_DIR, novela)
        try:
            titulo = novela_path.stem
            with open(novela_path) as f:
                libro = f.read()
                libro = libro.lower()
                libro = libro.strip().split()
                libro = ' '.join(libro)
                tokens = (w for w in tokenize(libro) if w not in sw)
                lemas = (lemmer.lemmatize(tok).lower()
                         for tok in tokens if tok.isalpha())
                tokenizada = ' '.join(lemma for lemma in lemas).strip()
                # tokenizada = ' '.join(x for x in novel_words(f, lemmer, sw))
                corpus.write('{} {}\n'.format(titulo, tokenizada))
        except UnicodeDecodeError:
            improcesables.append(novela)
    corpus.close()
def readdataset(p, wdic, maxlen=100):
    dataret = []
    goldret = []
    toolong = 0
    realmaxlen = 0
    wdic[None] = masksym
    with open(p) as f:
        data = csv.reader(f, delimiter=",")
        for row in data:
            rowelems = tokenize(row[2])
            realmaxlen = max(realmaxlen, len(rowelems))
            if len(rowelems) > maxlen:
                toolong += 1
            for rowelem in set(rowelems):
                if rowelem not in wdic:
                    wdic[rowelem] = len(wdic)
            dataret.append([wdic[x] for x in rowelems])
            goldret.append(row[0])
    print("{} comments were too long".format(toolong))
    maxlen = min(maxlen, realmaxlen)
    datamat = np.ones((len(dataret) - 1, maxlen)).astype("int32") * masksym
    for i in range(1, len(dataret)):
        datamat[i - 1, :min(len(dataret[i]), maxlen)] = \
            dataret[i][:min(len(dataret[i]), maxlen)]
    return datamat, np.asarray(goldret[1:], dtype="int32"), wdic
def predict(name, command):
    command = command.lower()
    label_path = path.join(path.dirname(path.realpath(__file__)),
                           "intents", "config", "labels",
                           "%s_labels.json" % name)
    with open(label_path, encoding="utf8") as f:
        labels = json.load(f)
    word_vocab = Vocabulary()
    word_vocab.load("%s_word_vocab.json" % name)
    # char embedding
    char_vocab = Vocabulary()
    char_vocab.load("%s_char_vocab.json" % name)
    idx2label = dict((idx, label) for idx, label in enumerate(labels))
    preprocessor = Preprocessor(word_vocab, None, char_vocab)
    model = BiLSTMCRF(labels, len(word_vocab), len(char_vocab))
    model.load_weights('intents/config/weights/%s.hdf5' % name)
    sentence = tokenize(command)
    features = preprocessor.transform([sentence])
    p = model.predict(features)
    predicted_labels = []
    for pred in p:
        predicted_labels.append(idx2label[pred])
    for word, label in zip(sentence, predicted_labels):
        print('%s: %s' % (word, label))
def embedding_matrix(raw_data, embs, fixed_len):
    '''
    Expects a list of strings.
    Turns a list of linguistic units (sentences, texts, ...) into a 3d tensor,
    such that each unit is represented by a matrix of concatenated embeddings
    of the words in this unit.
    '''
    matrices = []
    for sent in raw_data:
        sent = [tok.lower() for tok in tokenize(sent)]
        features = [embs.represent(tok) for tok in sent]
        # pad the left margin: place the last `fixed_len` word vectors
        # at the end of a zero matrix
        zeros = np.zeros((fixed_len, embs.dim))
        i = 1
        while i <= fixed_len and i <= len(features):
            zeros[-i, :] = features[-i]
            i += 1
        matrices.append(zeros)
    return np.stack(matrices, axis=0)
def clean_data(text, entry):
    # Convert all the text to lower case
    text = text.lower()
    # Expand negative contractions
    for t in NEG_CONTRACTIONS:
        text = re.sub(t[0], t[1], text)
    # Split the text into tokens
    tokens = tokenize(text)
    # Expand other contractions such as 'm
    tokens = [
        OTHER_CONTRACTIONS[token] if OTHER_CONTRACTIONS.get(token) else token
        for token in tokens
    ]
    ENGLISH_STOPWORDS = set(stopwords.words('english'))
    # Keep only tokens that contain at least one letter (drops punctuation)
    remove_punc = r'[a-z]+'
    tokens = [word for word in tokens if re.search(remove_punc, word)]
    # Remove English stop words
    tokens = [token for token in tokens if token not in ENGLISH_STOPWORDS]
    # Stem the tokens
    stemmer = nltk.PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    # Lemmatize the tokens
    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]
    return tokens
def features(self):
    """Trivially tokenized words."""
    words = tokenize(self.data.lower())
    puncs = [p for p in string.punctuation] + ['``', "''"]
    stops = stopwords.words('english')
    words = [w for w in words if w not in stops + puncs]
    return set(words)
def novel_words(novel, lemmer, sw):
    for line in novel:
        line = ' '.join(line.split()).strip()
        tokens = (w for w in tokenize(line) if w not in sw)
        nuevos = (lemmer.lemmatize(tok).lower()
                  for tok in tokens if tok.isalpha())
        yield ' '.join(nuevos)
async def process_sentence(doc: SentenceDocument):
    tokenized_sent = tokenize(doc.sentence)
    doc = TokenizedSentenceDocument(
        sent_tokens=[tokenized_sent],
        metadata='Single sentence')
    return process_tokenized_sentence_document(doc)
def perform_sentiment_analysis(txt):
    """
    This function is responsible for the sentiment analysis portion of the
    program. It first loads the saved, trained classification model into the
    program. It then uses the trained Naive-Bayes classifier to run sentiment
    analysis on 50 randomly selected Tweets from the user's query, analyzing
    the tone of each Tweet and predicting whether it is positive or negative.
    Each result is printed to the screen along with its related string.
    Finally, the overall sentiment is determined by calculating the percentage
    of positive sentiment results over the 50 randomly selected Tweets.

    Arguments:
    txt -- The tweet texts used for sentiment analysis by the model

    Returns:
    N/A
    """
    print("\n\t{}".format("----" * 16))
    print("\tSentiment Analysis:\n")
    c = pickle.load(open("classifier1.pickle", "rb"))
    print("\tUsing Naive-Bayes Classification Model to Analyze Tweets: \n")
    random.shuffle(txt)
    cl = [c.classify({x: True for x in clean(tokenize(p))}) for p in txt[:50]]
    [t.twt_print("\t({}) {}".format(a, p)) for a, p in zip(cl[:50], txt[:50])]
    pos_cnt = cl.count("+")
    neg_cnt = cl.count("-")
    sent = determine_sentiment(pos_cnt, neg_cnt)
    print("\n\tOverall Sentiment: {} ({}/{} +)".format(sent, pos_cnt, len(cl)))
    print("\n\t{}\n".format("----" * 16))
def stemming(inputSen):
    sentence = tokenize(inputSen)
    for x in sentence:
        # stemming removes suffixes, e.g. 'playing' -> 'play'
        inputSen = inputSen.replace(x, PorterStemmer().stem(x))
    return inputSen
def tokenizeDocs():
    (spellIndex, spellTitles, spellTags, spells,
     spellSource, spellDescription) = collectDocs()
    gen_docs = [[w.lower() for w in tokenize(text)] for text in spellTags]
    return spellIndex, spellTitles, gen_docs, spells, spellSource, spellDescription
def create_keyword_regex(keyword):
    # import nltk
    ensure_package_path()
    from nltk.tokenize import wordpunct_tokenize as tokenize
    tokens = tokenize(keyword)
    pattern = '\\s+'.join(tokens)
    pattern = '\\b%s\\b' % pattern
    return re.compile(pattern, re.I | re.UNICODE)
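# Hedged usage sketch (assumes nltk is installed and ensure_package_path()
# resolves it; wordpunct_tokenize splits "machine learning" into
# ['machine', 'learning']):
#
#   keyword_re = create_keyword_regex("machine learning")
#   keyword_re.pattern                                     # r'\bmachine\s+learning\b'
#   bool(keyword_re.search("Uses Machine   Learning."))    # True (case-insensitive,
#                                                          # flexible whitespace)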
def load_semeval_sents(filename):
    sents = []
    for review in Parser.parse(filename).getroot().findall('.//sentence'):
        sent = tokenize(review.find('text').text)
        sent = to_lower(sent)
        sent = filter_symbol(sent)
        sents.append(sent)
    return sents
def getTF(path):
    stops = stopwords.words('english')
    punctuations = [
        '(', ')', ';', ':', '[', ']', ',', '.', '!', '\"', '#', '$', '%', '&',
        '\'', '*', '+', '-', '/', '<', '=', '>', '?', '@', '\\', '^', '_', '`',
        '{', '|', '}', '~'
    ]
    remove_digits = str.maketrans('', '', digits)
    dic = {}
    tf = {}
    wordz = set()
    tokens = []
    try:
        text = textract.process(path).decode().translate(remove_digits)
    except Exception as identifier:
        print('--Err: FAILED TO PARSE ' + path + ' in normal mode, trying OCR')
        try:
            text = textract.process(
                path, method='tesseract').decode().translate(remove_digits)
        except Exception as identifier:
            print('--Err: FAILED TO PARSE ' + path + ' even in OCR mode, skipping')
            return dict(), 1
            # return set({}), {}
    tempTokens = tokenize(text)
    if len(tempTokens) == 0:
        print('--Err: FAILED TO PARSE ' + path + ' in normal mode, trying OCR')
        try:
            text = textract.process(
                path, method='tesseract').decode().translate(remove_digits)
            # re-tokenize the OCR output so it is actually used below
            tempTokens = tokenize(text)
        except Exception as identifier:
            print('--Err: FAILED TO PARSE ' + path + ' even in OCR mode, skipping')
            return dict(), 1
    for word in tempTokens:
        if (word.lower() not in stops
                and word.lower() not in punctuations
                and len(word) > 1):
            tokens.append(word)
    for word in tokens:
        if word in dic:
            dic[word] = dic[word] + 1
        else:
            dic[word] = 1
    # keep only the 200 most frequent terms
    counter1 = 0
    for key, value in sorted(dic.items(), key=lambda item: item[1],
                             reverse=True):
        if counter1 >= 200:
            break
        tf[key] = value
        # wordz.add(key)
        counter1 += 1
    return tf, 0
def readAndDis():
    writeString = ""
    with open(
            'D:/SKOLE/MASTER 2016/testing/Testing database/100URL-target-context.txt',
            encoding='utf8') as fp:
        for line in fp:
            testLineArr = line.split('|')
            context = testLineArr[2]
            disWord = testLineArr[1]
            # keep only alphabetic characters in the context
            # (first parameter is the replacement, second is the input string)
            regex = re.compile('[^a-zA-Z]')
            context = regex.sub(' ', context)
            context1 = findNouns(context)
            # POS-tag the word we want to disambiguate
            wordPos = pos(tokenize(disWord))
            print(wordPos)
            if wordPos[0][1][0] == 'V':
                targetWordSynsets = wn.synsets(disWord, pos=wn.VERB)
            else:
                targetWordSynsets = wn.synsets(disWord, pos=wn.NOUN)
            if targetWordSynsets is None:
                return
            print(targetWordSynsets)
            # time the disambiguation algorithm
            start = timeit.timeit()
            synsetHashValues = disambiguationAlgo(targetWordSynsets, context1)
            end = timeit.timeit()
            print("\n###\nTime used in algorithm : " + str(end - start) +
                  " seconds\n###\n")
            print("Based on the context :")
            writeString += "\n"
            if isinstance(synsetHashValues, dict):
                for key, value in sorted(synsetHashValues.items(),
                                         key=operator.itemgetter(1),
                                         reverse=True):
                    syns = key.name()
                    print(key.name() + " : " + str(value) + "\n")
                    writeString += key.name() + " : " + str(value) + "\n"
                writeString += "\n"
            else:
                writeString += synsetHashValues.name() + "\n"
    return writeString
def segment(self, doc):
    raw_sentences = doc.split("\n")
    sentences = []
    for sentence in raw_sentences:
        # cur_sentence = sentence.split(". ")
        cur_sentence = nltk_segment(sentence)
        if len(cur_sentence) > 0:
            sentences += cur_sentence
    tokenized_sentences = [tokenize(sentence) for sentence in sentences]
    return (sentences, tokenized_sentences)
def tokenizer(mode, lowercase=False):
    if mode == 'char':
        if lowercase:
            tokenizer = (lambda s: list(s.strip().lower()))
        else:
            tokenizer = (lambda s: list(s.strip()))
    elif (mode == 'space') or (mode == 'bpe'):
        if lowercase:
            tokenizer = (lambda s: s.lower().split())
        else:
            tokenizer = str.split
    elif mode == 'word':
        if lowercase:
            tokenizer = (lambda s: tokenize(s.lower()))
        else:
            tokenizer = (lambda s: tokenize(s))
    else:
        raise ValueError('Unknown tokenizer: "%s"' % mode)
    return tokenizer
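# Hedged examples ('word' mode relies on whatever `tokenize` the module
# imports, e.g. nltk's word_tokenize):
#
#   tokenizer('space')("Hello there GENERAL Kenobi")
#   # -> ['Hello', 'there', 'GENERAL', 'Kenobi']
#   tokenizer('char', lowercase=True)("Hi!")
#   # -> ['h', 'i', '!']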
def single_score_fn(s):
    s = tokenize(s)
    count_male_pronouns = sum(s.count(p) for p in male_pronouns)
    count_female_pronouns = sum(s.count(p) for p in female_pronouns)
    if count_male_pronouns > count_female_pronouns:
        return self.POSITIVE if self.config["scorer_attribute"] == "male" else self.NEGATIVE
    if count_female_pronouns > count_male_pronouns:
        return self.POSITIVE if self.config["scorer_attribute"] == "female" else self.NEGATIVE
    else:  # equal counts
        return self.POSITIVE if self.config["scorer_attribute"] == "other" else self.NEGATIVE
def findNouns(context):
    tokenized = tokenize(context)
    sentence = pos(tokenized)
    properNouns = [
        word for word, pos in sentence
        if pos in ('NN', 'NNS', 'NNP', 'NNPS')
    ]
    print("List over the nouns")
    # print(properNouns)
    print("\n")
    return properNouns
def create_keyword_regex(keyword):
    print('create_keyword_regex')
    # import nltk
    ensure_package_path()
    from nltk.tokenize import wordpunct_tokenize as tokenize
    print('tokenize ==> %s' % (keyword))
    tokens = tokenize(keyword)
    pattern = '\\s+'.join(tokens)
    pattern = '\\b%s\\b' % pattern
    print('compile pattern ==> %s' % (pattern))
    return re.compile(pattern, re.I | re.UNICODE)
def gensim_indexer(embeddings, doc, ignore=True):
    # type: (KeyedVectors, str, bool) -> Iterator[int]
    for word in tokenize(doc):
        try:
            yield embeddings.vocab[word.lower()].index
        except KeyError:
            if ignore:
                pass
            else:
                raise
def make_ngrams(corpus, **kwargs):
    n = kwargs.get('n', 1)
    ncursor = 1
    stem = kwargs.get('stem', True)
    ngrams = tokenize(corpus)
    if stem:
        ngrams = [stemmer.stem(ngram) for ngram in ngrams]
    ncursor += 1
    while ncursor <= n:
        ngrams += ngramify(ngrams, n)
        ncursor += 1
    return ngrams
def encode_to_iob(self, sentence, entities):
    """
    Extract IOB labels for a given sentence

    Args:
        - sentence: list of tokens
        - entities: dict of slot => entity items

    Returns:
        list of IOB labels
    """
    # base data structure: list of labels (target)
    iob_labels = ['_O' for i in range(len(sentence))]
    # first step: get base labels from slot entities
    for slot in entities:
        for token in tokenize(entities[slot]):
            # TODO: if not in tokens?
            if token in sentence:
                i = sentence.index(token)
                iob_labels[i] = slot
    # redefine each label as either a beginning or inside tag;
    # convert into a list of groups to make it easier to distinguish
    # between single and sequence entities
    grouped_labels = []
    group = []
    prev_label = None
    for label in iob_labels:
        if label == prev_label:
            group.append(label)
        else:
            if prev_label:
                grouped_labels.append(group)
            group = [label]
            prev_label = label
    if group:
        grouped_labels.append(group)
    # extract groups into one sequence
    labels = []
    for g in grouped_labels:
        if g[0] == '_O':
            labels += g
        elif len(g) == 1:
            labels.append(g[0] + '_I')
        else:
            labels.append(g[0] + '_B')
            labels += [x + '_I' for x in g[1:]]
    return labels
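# Hedged walk-through (hypothetical slot name; assumes `tokenize` splits the
# entity text into its words):
#
#   sentence = ['fly', 'to', 'new', 'york']
#   entities = {'destination': 'new york'}
#   # -> ['_O', '_O', 'destination_B', 'destination_I']
#
# Note that with this grouping logic a single-token entity comes out as
# 'slot_I' rather than 'slot_B'.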
def bigram_splitter(text: str):
    """
    Returns list of bigrams in given text.
    """
    tokens = tokenize(text)
    if len(tokens) < 2:
        bigrams = []
    else:
        bigrams = [
            tokens[i] + " " + tokens[i + 1]
            for i in range(len(tokens) - 1)
        ]
    return bigrams
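# Hedged example (assumes `tokenize` yields one token per word):
#
#   bigram_splitter("the quick brown fox")
#   # -> ['the quick', 'quick brown', 'brown fox']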
def analyser(documents):
    porter = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    modified_arr = [[porter.stem(i.lower())
                     for i in tokenize(d.translate(None, string.punctuation))
                     if i.lower() not in stop_words]
                    for d in documents]
    modified_doc = [' '.join(i) for i in modified_arr]
    tf_idf = TfidfVectorizer().fit_transform(modified_doc)
    # compare each document in the first half against the second half and
    # report the index of its closest (smallest cosine distance) counterpart
    for i in xrange(len(documents) / 2):
        minimum = (1, None)
        for j in xrange(len(documents) / 2, len(documents)):
            minimum = min((cosine(tf_idf[i].todense(), tf_idf[j].todense()),
                           j - len(documents) / 2), minimum)
        print minimum[1] + 1
def transfer_vec(txts, wv_model, padding=10, dim=300):
    """
    transfer
    :param txts: a list of txt
    :param wv_model:
    :return: list of vector
    """
    if type(txts) in (list, tuple):
        vec = []
        for sentence in txts:
            sen_vec = []
            for word in tokenize(sentence.lower()):
                try:
                    sen_vec.append(wv_model[word])
                except KeyError:
                    sen_vec.append(np.random.rand(dim))
            while len(sen_vec) < padding:
                sen_vec.append(np.zeros(dim, dtype=float))
            if len(sen_vec) > padding:
                sen_vec = sen_vec[:padding]
            vec.append(np.array(sen_vec, dtype='f'))
        return vec
    elif type(txts) == str:
        sen_vec = []
        for word in tokenize(txts.lower()):
            try:
                sen_vec.append(wv_model[word])
            except KeyError:
                sen_vec.append(np.random.rand(dim))
        while len(sen_vec) < padding:
            sen_vec.append(np.zeros(dim, dtype=float))
        if len(sen_vec) > padding:
            sen_vec = sen_vec[:padding]
        return [np.array(sen_vec, dtype='f')]
    else:
        raise TypeError('%s is not a supported type' % str(type(txts)))
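# Hedged usage sketch (assumes `wv_model` is a gensim KeyedVectors-style
# mapping with 300-dimensional vectors):
#
#   vecs = transfer_vec(["hello world", "just one more sentence"], wv_model)
#   len(vecs)           # 2
#   vecs[0].shape       # (10, 300): padded/truncated to `padding` rows
#   transfer_vec("hello world", wv_model)[0].shape   # (10, 300)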
def computeSentiment(tweet_text):
    pos_count = 0
    neg_count = 0
    pos_terms = []
    neg_terms = []
    st = EnglishStemmer()
    tokenized_tweet = tokenize(tweet_text)
    for t in tokenized_tweet:
        # print st.stem(t.lower())
        if st.stem(t.lower()) in negative_terms:
            neg_terms.append(t.lower())
            neg_count += 1
        elif st.stem(t.lower()) in positive_terms:
            pos_terms.append(t.lower())
            pos_count += 1
    return pos_count, neg_count, set(pos_terms), set(neg_terms)
def computeSentiment(tweet_text):
    annotated = ''
    positive = 0
    negative = 0
    st = EnglishStemmer()
    tokenized_tweet = tokenize(tweet_text)
    for t in tokenized_tweet:
        # print st.stem(t.lower())
        wsp = ' '
        if len(annotated) == 0 or annotated[-1] in '@#':
            wsp = ''
        if st.stem(t.lower()) in negative_terms:
            annotated += wsp + '<span class="negative">' + t + '</span>'
            negative += 1
        elif st.stem(t.lower()) in positive_terms:
            annotated += wsp + '<span class="positive">' + t + '</span>'
            positive += 1
        else:
            if len(t) == 1 and t not in '@#':
                annotated += t
            else:
                annotated += wsp + t
    return annotated, positive, negative
def extract_words(msg):
    ttl_words = set(tokenize(msg.replace('=\\n', '').lower()))
    final_words = [word for word in ttl_words
                   if word not in stopword and len(word) >= 3]
    # tokens are already lowercased, so compare against 'subject'
    # rather than 'Subject:' (which could never match here)
    final_words = [word for word in final_words if word != "subject"]
    return final_words
with open('sword.set', 'rb') as f:
    sword_list = load(f)
with open('phrase.set', 'rb') as f:
    phrase_list = load(f)

problem = lil_matrix((5000, 17173))
n = 0
for i, tfile in enumerate(train_files):
    if i < 2500:
        fdir = pos_dir
    else:
        fdir = neg_dir
    with open(fdir + tfile) as f:
        text = f.read()
    tokens = tokenize(text)
    fphrases = phrases(tokens)
    for token in tokens:
        if token in sword_list:
            ind = sword_list.index(token)
            problem[i, ind] = 1
    for p in fphrases:
        if p in phrase_list:
            ind = phrase_list.index(p) + 3111
            problem[i, ind] = 1

with open('problem.matrix', 'wb') as f:
    dump(problem, f)
def tag(self, tense):
    """Translates tags generated by the tagger into the unified format.

    Args:
        tense: text to tag

    Returns:
        list of tuples (word and its form in unified format)
    """
    klasa_map = {'V': 'czasownik', 'S': 'rzeczownik', 'A': 'przymiotnik',
                 'N': 'liczebnik', 'Z': 'zaimek', 'D': 'przysłówek',
                 'P': 'przyimek', 'C': 'spójnik', 'I': 'wykrzyknik',
                 'T': 'partykuła'}
    liczba_map = {'S': 'pojedyńcza', 'P': 'mnoga'}
    przypadek_map = {'N': 'mianownik', 'G': 'dopełniacz', 'D': 'celownik',
                     'A': 'biernik', 'I': 'narzędnik', 'L': 'miejscownik',
                     'V': 'wołacz'}
    rodzaj_map = {'M': 'm', 'P': 'm', 'A': 'm', 'I': 'm', 'O': 'm',
                  'F': 'ż', 'R': 'ż', 'T': 'ż', 'N': 'n'}
    osoba_map = {'1': 'pierwsza', '2': 'druga', '3': 'trzecia',
                 'I': 'bezokolicznik', 'B': 'bezosobnik',
                 'U': 'imiesłów', 'W': 'imiesłów'}
    czas_map = {'T': 'teraźniejszy', 'P': 'przeszły', 'F': 'przyszły'}
    tryb_map = {'O': 'oznajmujący', 'P': 'przypuszczający', 'R': 'rozkazujący'}
    aspekt_map = {'D': 'dokonane', 'N': 'niedokonane'}

    words = self.__utag.tag(tokenize(tense))
    for i, (word, form) in enumerate(words):
        word_info = {'klasa': klasa_map.get(form[0], 'nieznany')}
        if form[1] in liczba_map:
            word_info['liczba'] = liczba_map[form[1]]
        if len(form) >= 3 and form[2] in przypadek_map:
            word_info['przypadek'] = przypadek_map[form[2]]
        if len(form) >= 4 and form[3] in rodzaj_map:
            word_info['rodzaj'] = rodzaj_map[form[3]]
        if len(form) >= 6 and form[5] in osoba_map:
            word_info['osoba'] = osoba_map[form[5]]
        if len(form) >= 7 and form[6] in czas_map:
            word_info['czas'] = czas_map[form[6]]
        if len(form) >= 8 and form[7] in tryb_map:
            word_info['tryb'] = tryb_map[form[7]]
        if len(form) >= 9 and form[8] in aspekt_map:
            word_info['aspekt'] = aspekt_map[form[8]]
        words[i] = (words[i][0], word_info)
    return words
    return c

defined_words = set()
freq = {}
total_tokens = 0
total_defs = 0
with codecs.open(data_filepath, 'r', 'utf-8') as ifp:
    for line in ifp:
        total_defs = total_defs + 1
        line = line.strip()
        parts = line.split('\t')
        if parts[0] not in freq:
            freq[parts[0]] = 0
        freq[parts[0]] = freq[parts[0]] + 1
        defined_words.add(parts[0])
        for t in tokenize(parts[3]):
            if t not in freq:
                freq[t] = 0
            freq[t] = freq[t] + 1
            total_tokens = total_tokens + 1

print('#word being defined: ' + str(len(defined_words)))
print('#definition: ' + str(total_defs))
print('#tokens: ' + str(total_tokens))
print('vocab size: ' + str(len(freq)))
print('rare word frequency: ')
print(' - 1: ' + str(num_words_with_freq(freq, 1)))
print(' - 2: ' + str(num_words_with_freq(freq, 2)))
print(' - 3: ' + str(num_words_with_freq(freq, 3)))
print(' - 4: ' + str(num_words_with_freq(freq, 4)))
print(' - 5: ' + str(num_words_with_freq(freq, 5)))
def _tokenize(self, tense, isPl):
    if isPl:
        return self.tagger.tag(tense)
    else:
        return self.__utag.tag(tokenize(tense))
def tokenize_sentence(sentence):
    ##################################################
    # Tokenizes the sentence and rejoins the tokens  #
    # with single spaces                             #
    ##################################################
    return ' '.join(list(tokenize(sentence)))