class HapaxLegomera(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.TK = NISTTokenizer()
        self.punct = re.compile('^[^a-zA-Z0-9_]$')

    def compile_counts(self, X, *_):
        word_counts = Counter()
        for sent in X:
            tokens = self.TK.tokenize(sent, lowercase=True)
            for token in tokens:
                if not self.punct.match(token):
                    word_counts.update([token])
        return word_counts

    def fit(self, X, *_):
        return self

    def transform(self, X, *_):
        word_counts = self.compile_counts(X)
        result = []
        for sent in X:
            features = defaultdict(int)
            tokens = self.TK.tokenize(sent, lowercase=True)
            for token in tokens:
                if not self.punct.match(token):
                    if word_counts[token] == 1:
                        features['hapax_legomera'] += 1
                    elif word_counts[token] == 2:
                        features['hapax_dislegomera'] += 1
            result.append(features)
        return result

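# A minimal usage sketch (not from the original code): HapaxLegomera returns a
# list of dicts, so it can be paired with scikit-learn's DictVectorizer to get a
# numeric feature matrix. The example corpus below is hypothetical.
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

hapax_pipeline = Pipeline([
    ('hapax', HapaxLegomera()),
    ('vectorize', DictVectorizer()),
])
X_hapax = hapax_pipeline.fit_transform(['the cat sat on the mat', 'a dog barked twice'])
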
def transform(self, X, y=None):
    awl = []
    tokenizer = NISTTokenizer()
    for i in range(X.shape[0]):
        tokens = tokenizer.tokenize(X[i, :][0], lowercase=True)
        awl.append(np.mean([len(w) for w in tokens if not _punctuation.match(w)]))
    return np.array(awl).reshape(-1, 1)

def tokenise(caption, lower=True):
    # tokenize the caption with the NIST tokenizer, optionally lowercasing first
    nist = NISTTokenizer()
    if lower:
        caption = caption.lower()
    caption = nist.tokenize(caption)
    return caption

def transform(self, X, y=None):
    fsf = []
    tokenizer = NISTTokenizer()
    for i in range(X.shape[0]):
        tokens = tokenizer.tokenize(X[i, :][0], lowercase=True)
        fsf.append(len(list(filter(lambda x: x[1] == '.', nltk.pos_tag(tokens)))))
    return np.array(fsf).reshape(-1, 1)

def transform(self, X, y=None):
    sliw = []
    tokenizer = NISTTokenizer()
    for i in range(X.shape[0]):
        tokens = tokenizer.tokenize(X[i, :][0], lowercase=True)
        sliw.append(len(tokens))
    return np.array(sliw).reshape(-1, 1)

def transform(self, X, y=None):
    ndw = []
    tokenizer = NISTTokenizer()
    for i in range(X.shape[0]):
        tokens = tokenizer.tokenize(X[i, :][0], lowercase=True)
        ndw.append(len(set(w for w in tokens if not _punctuation.match(w))))
    return np.array(ndw).reshape(-1, 1)

def transform(self, X, y=None):
    hl = []
    tokenizer = NISTTokenizer()
    for i in range(X.shape[0]):
        tokens = tokenizer.tokenize(X[i, :][0], lowercase=True)
        counts = Counter(w for w in tokens if not _punctuation.match(w))
        hl.append(len([w for w, n in counts.items() if n == 1]))
    return np.array(hl).reshape(-1, 1)

def word_tokenize(sentence):
    tokenizer = NISTTokenizer()
    sentence = ' '.join(tokenizer.tokenize(sentence))
    # Rejoin special tokens that were split apart by the tokenizer,
    # e.g. "<PERSON_1>" -> "< PERSON _ 1 >"
    for match in re.finditer(r'< (?:[A-Z]+ _ )+\d+ >', sentence):
        sentence = sentence.replace(match.group(), ''.join(match.group().split()))
    return sentence

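# Hypothetical check of the rejoining step in word_tokenize above: per the
# comment, the NIST tokenizer splits placeholders like "<PERSON_1>" into
# "< PERSON _ 1 >", and the regex glues them back into a single token.
print(word_tokenize('Tell <PERSON_1> that <ORG_2> called.'))
# expected to contain "<PERSON_1>" and "<ORG_2>" as single tokens
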
def __init__(self, char_level=False, strip_punctuation=False, ngram_range=(1, 1)):
    self.TK = NISTTokenizer()
    self.word_index = dict()
    self.index_word = dict()
    self.strip_punctuation = strip_punctuation
    self.punct = re.compile('^[^a-zA-Z0-9_]$')

def build_word_list():
    nist = NISTTokenizer()
    text = sys.stdin.read()
    stop_words = set(stopwords.words('english'))
    words = nist.tokenize(text, lowercase=True)
    words = [word for word in words if word not in stop_words]
    words = [word for word in words if word.isalpha()]
    # keep words between 4 and 8 characters long
    words = [word for word in words if 3 < len(word) < 9]
    return words

def __old__get_train_test_split(corpus, annotations, n_splits=5,
                                train_test_split=0.8, cutoff=3):
    annotations_no_tweet = annotations.drop(labels='Tweet', axis=1)
    to_drop = annotations_no_tweet[annotations_no_tweet.sum(axis=1) > cutoff].index
    annotations.drop(labels=to_drop, axis=0, inplace=True)
    train_idx = set()
    test_idx = set()
    for i in range(1, cutoff + 1, 1):
        an = annotations[annotations.sum(axis=1) == i]
        train_sample = an.sample(frac=train_test_split)
        train_idx.update(train_sample.index)
        test_idx.update(set(an.index).difference(train_sample.index))
    annotations.at[train_idx, 'set'] = 'train'
    annotations.at[test_idx, 'set'] = 'test'
    kf = KFold(n_splits=n_splits)
    train_data = annotations[annotations.set == 'train']
    for i, (train_idx, test_idx) in enumerate(kf.split(train_data)):
        fold_id = 'fold_{}'.format(i + 1)
        annotations[fold_id] = None
        col_id = annotations.columns.get_loc(fold_id)
        annotations.iloc[train_idx, col_id] = 'train'
        annotations.iloc[test_idx, col_id] = 'test'
    tokenizer = NISTTokenizer()
    annotations.Tweet = annotations.Tweet.apply(
        lambda x: ' '.join(tokenizer.tokenize(x, lowercase=True)))
    annotations.to_csv('moral-dataset-{}.csv'.format(corpus))
    elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=False)
    embeds = []
    for i in range(0, annotations.shape[0] // 100 + 1):
        print('Computing embeddings for [{} .. {})'.format(i * 100, (i + 1) * 100))
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            tweets = annotations[['Tweet']].iloc[(i * 100):(i + 1) * 100, :]
            if tweets.shape[0] > 0:
                elmo_tweet_embeddings = session.run(
                    elmo(tf.squeeze(tf.cast(tweets.values, tf.string)),
                         signature='default', as_dict=True)['default'])
                embeds.append(pd.DataFrame(index=tweets.index,
                                           data=elmo_tweet_embeddings))
    all_embeds = pd.concat(embeds, axis=0)
    all_embeds.to_csv('moral-dataset-{}_elmo_embeddings.csv'.format(corpus))

def train(factrueval2016_devset_dir: str, split_by_paragraphs: bool,
          elmo_will_be_tuned: bool, max_epochs: int, batch_size: int, lr: float,
          gpu_memory_frac: float, model_name: str) -> ELMo_NER:
    if os.path.isfile(model_name):
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, ELMo_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.format(model_name))
        print('')
    else:
        temp_json_name = tempfile.NamedTemporaryFile(mode='w').name
        try:
            factrueval2016_to_json(factrueval2016_devset_dir, temp_json_name,
                                   split_by_paragraphs)
            X, y = load_dataset(temp_json_name)
        finally:
            if os.path.isfile(temp_json_name):
                os.remove(temp_json_name)
        print('Data for training have been loaded...')
        print('Number of samples is {0}.'.format(len(y)))
        print('')
        max_number_of_tokens = 0
        tokenizer = NISTTokenizer()
        for cur in X:
            n_tokens = len(tokenizer.international_tokenize(cur))
            if n_tokens > max_number_of_tokens:
                max_number_of_tokens = n_tokens
        del tokenizer
        print('Maximal number of tokens is {0}.'.format(max_number_of_tokens))
        # round the sequence length up to the next power of two
        n_tokens = 2
        while n_tokens < max_number_of_tokens:
            n_tokens *= 2
        elmo_hub_module_handle = 'http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz'
        recognizer = ELMo_NER(finetune_elmo=elmo_will_be_tuned, batch_size=batch_size,
                              l2_reg=1e-3, max_seq_length=n_tokens,
                              elmo_hub_module_handle=elmo_hub_module_handle,
                              validation_fraction=0.25, max_epochs=max_epochs,
                              patience=5, gpu_memory_frac=gpu_memory_frac,
                              verbose=True, random_seed=42, lr=lr)
        recognizer.fit(X, y)
        with open(model_name, 'wb') as fp:
            pickle.dump(recognizer, fp)
        print('')
        print('The NER has been successfully fitted and saved into the file `{0}`...'.format(model_name))
        print('')
    return recognizer

class WordTokenizer2(BaseEstimator, TransformerMixin):
    def __init__(self, char_level=False, strip_punctuation=False, ngram_range=(1, 1)):
        self.TK = NISTTokenizer()
        self.word_index = dict()
        self.index_word = dict()
        self.strip_punctuation = strip_punctuation
        self.punct = re.compile('^[^a-zA-Z0-9_]$')

    def fit(self, X, *_):
        i = 1
        for sent in X:
            tokens = self.TK.tokenize(sent, lowercase=True)
            for t in tokens:
                if self.strip_punctuation and self.punct.match(t):
                    continue
                if t not in self.word_index:
                    self.word_index[t] = i
                    self.index_word[i] = t
                    i += 1
        return self

    def transform(self, X, *_):
        # returns sequences of word indices, e.g. [1, 2, 3, 4]
        sequences = []
        for sent in X:
            seq = []
            tokens = self.TK.tokenize(sent, lowercase=True)
            for t in tokens:
                if self.strip_punctuation and self.punct.match(t):
                    continue
                if t in self.word_index:
                    seq.append(self.word_index[t])
            sequences.append(seq)
        return sequences

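# Hypothetical usage of WordTokenizer2: fit builds the vocabulary index and
# transform maps sentences to index sequences; unseen words are silently
# dropped. Exact indices depend on tokenization order.
wt = WordTokenizer2(strip_punctuation=True)
wt.fit(['The cat sat on the mat.', 'The dog barked.'])
print(wt.transform(['the cat barked']))  # e.g. [[1, 2, 7]]
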
def get_train_test_split(corpus, annotations, n_splits=5, train_test_split=0.8, cutoff=3):
    annotations_no_tweet = annotations.drop(labels='Tweet', axis=1)
    grps = annotations_no_tweet.apply(lambda v: ''.join(map(str, v)),
                                      axis=1).to_frame(0).groupby(0)[0]
    test_idx = grps.apply(
        lambda g: g.sample(frac=1 - train_test_split)).index.get_level_values(1)
    train_idx = set(annotations_no_tweet.index).difference(test_idx)
    annotations.at[train_idx, 'set'] = 'train'
    annotations.at[test_idx, 'set'] = 'test'
    train_grps = annotations_no_tweet.loc[train_idx, :] \
        .apply(lambda v: ''.join(map(str, v)), axis=1) \
        .to_frame(0).groupby(0)[0]
    for i in range(n_splits):
        fold_test_idx = train_grps.apply(
            lambda g: g.sample(frac=1 / n_splits)).index.get_level_values(1)
        fold_train_idx = set(train_idx).difference(fold_test_idx)
        fold_id = 'fold_{}'.format(i + 1)
        annotations[fold_id] = None
        annotations.loc[fold_train_idx, fold_id] = 'train'
        annotations.loc[fold_test_idx, fold_id] = 'test'
    tokenizer = NISTTokenizer()
    annotations.Tweet = annotations.Tweet.apply(
        lambda x: ' '.join(tokenizer.tokenize(x, lowercase=True)))
    annotations.to_csv('moral-dataset-{}.csv'.format(corpus))
    elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=False)
    embeds = []
    for i in range(0, annotations.shape[0] // 100 + 1):
        print('Computing embeddings for [{} .. {})'.format(i * 100, (i + 1) * 100))
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            tweets = annotations[['Tweet']].iloc[(i * 100):(i + 1) * 100, :]
            if tweets.shape[0] > 0:
                elmo_tweet_embeddings = session.run(
                    elmo(tf.squeeze(tf.cast(tweets.values, tf.string)),
                         signature='default', as_dict=True)['default'])
                embeds.append(pd.DataFrame(index=tweets.index,
                                           data=elmo_tweet_embeddings))
    all_embeds = pd.concat(embeds, axis=0)
    all_embeds.to_csv('moral-dataset-{}_elmo_embeddings.csv'.format(corpus))

class NltkNistTokenizer(Tokenizer):
    def __init__(self) -> None:
        super().__init__()
        self._base_tokenizer = NISTTokenizer()

    def tokenize_text(self, text: str) -> List[str]:
        return self._base_tokenizer.tokenize(text)

def get_nist_tokenizer():
    # Inline lazy import because importing nltk is slow
    try:
        from nltk.tokenize.nist import NISTTokenizer
    except LookupError:
        # the NIST tokenizer needs the 'perluniprops' data package;
        # download it and retry the import so the name is actually bound
        import nltk
        nltk.download('perluniprops')
        from nltk.tokenize.nist import NISTTokenizer
    return NISTTokenizer()

def score(self, document, sentenceTokenizer=punkt_tokenizer, wordTokenizer=NISTTokenizer()):
    assert self.__wordIdf is not None, "Cannot score the model before fitting"
    word_tfidf = self.__computeTfIdf(document, wordTokenizer)
    centroid_sentence = self.__computeCentroidSentence(word_tfidf)
    sentences = sentenceTokenizer.tokenize(document)
    score_dicts = self.__scoreSentencesAgainstCentroid(
        centroid_sentence, sentences, wordTokenizer)
    return score_dicts

def computeWordIdf(documents, wordTokenizer=NISTTokenizer()):
    tot_document_count = 0
    total_word_count = Counter()
    for document in documents:
        words = wordTokenizer.tokenize(document, lowercase=True)
        doc_word_count = Counter(set(words))
        total_word_count += doc_word_count
        tot_document_count += 1
    word_idf = defaultdict(int)
    for (word, count) in total_word_count.items():
        word_idf[word] = log(tot_document_count / count)
    return word_idf

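# Toy illustration of computeWordIdf (the corpus is hypothetical): words that
# occur in every document get an idf of 0, rarer words get larger values.
docs = ['the cat sat', 'the dog ran', 'the cat ran']
idf = computeWordIdf(docs)
print(idf['the'])  # log(3/3) = 0.0
print(idf['dog'])  # log(3/1) ≈ 1.099
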
class WordIndexer(BaseEstimator, TransformerMixin):
    """
    Code modified from
    https://github.com/adventuresinML/adventures-in-ml-code/blob/master/keras_lstm.py
    """

    def __init__(self, reverse=False):
        self.TK = NISTTokenizer()
        self.word2idx = None
        self.sent_size = 0

    def build_vocab(self, X, *_):
        counter = Counter()
        max_len = 0
        for sent in X:
            tokens = self.TK.tokenize(sent, lowercase=True)
            if len(tokens) > max_len:
                max_len = len(tokens)
            counter.update(tokens)
        sort_by_counts = sorted(counter.items(), key=lambda x: x[1])
        words, counts = zip(*sort_by_counts)
        word2idx = dict(zip(words, range(1, len(words) + 1)))
        return word2idx, max_len

    def fit(self, X, *_):
        self.word2idx, self.sent_size = self.build_vocab(X)
        return self

    def transform(self, X, *_):
        # allow some slack beyond the longest sentence seen during fit
        vec = np.zeros((len(X), self.sent_size + 25))
        for i, sent in enumerate(X):
            tokens = self.TK.tokenize(sent, lowercase=True)
            for j, tok in enumerate(tokens):
                vec[i][j] = self.word2idx[tok]
        return vec

def process_tweets(pair_id, target_pair, n_splits):
    target_pair['set'] = None
    target_pair['set'][target_pair['Test/Train/Dev'].isin(['Train', 'Dev'])] = 'train'
    target_pair['set'][pd.isnull(target_pair['set'])] = 'test'
    kf = KFold(n_splits=n_splits)
    target_pair_train = target_pair[target_pair.set == 'train']
    for i, (train_idx, test_idx) in enumerate(kf.split(target_pair_train)):
        fold_id = 'fold_{}'.format(i + 1)
        target_pair[fold_id] = None
        target_pair[fold_id].iloc[train_idx] = 'train'
        target_pair[fold_id].iloc[test_idx] = 'test'
    tokenizer = NISTTokenizer()
    target_pair.Tweet = target_pair.Tweet.apply(
        lambda x: ' '.join(tokenizer.tokenize(x, lowercase=True)))
    target_pair.rename(columns={'Stance 1': 'Target 1', 'Stance 2': 'Target 2'},
                       inplace=True)
    target_pair.to_csv('tweets-{}.csv'.format(pair_id))
    elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=False)
    embeds = []
    print('There are {} tweets in pair {}'.format(target_pair.shape[0], pair_id))
    for i in range(0, target_pair.shape[0] // 100 + 1):
        print('Computing embeddings for [{} .. {})'.format(i * 100, (i + 1) * 100))
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            tweets = target_pair[['Tweet']].iloc[(i * 100):(i + 1) * 100, :]
            if tweets.shape[0] > 0:
                elmo_tweet_embeddings = session.run(
                    elmo(tf.squeeze(tf.cast(tweets.values, tf.string)),
                         signature='default', as_dict=True)['default'])
                embeds.append(pd.DataFrame(index=tweets.index,
                                           data=elmo_tweet_embeddings))
    all_embeds = pd.concat(embeds, axis=0)
    print('There are {} embeddings in pair {}'.format(all_embeds.shape[0], pair_id))
    all_embeds.to_csv('tweets-{}_elmo_embeddings.csv'.format(pair_id))

def loadData(dir_path):
    data = AutoVivification()
    for root, dirs, files in os.walk(dir_path):
        for file_name in files:
            if not file_name.endswith('.xml'):
                continue
            suffix = file_name.find('.xml')
            # language of the data
            lang = lang_map[file_name[:suffix].split('_')[2].lower()]
            # data type: train/dev/test
            data_type = os.path.basename(root)
            # data domain: restaurant, laptop, ...
            domain = os.path.basename(os.path.dirname(root))
            # subtask
            task = os.path.basename(os.path.dirname(os.path.dirname(root)))
            tokenizer = NISTTokenizer()
            tree = ET.parse(os.path.join(root, file_name))
            revs = tree.getroot()
            for rev in revs:
                for sents in rev:
                    for sent in sents:
                        text = None
                        ops = []
                        for c in sent:
                            if c.tag == 'text':
                                # text = tokenizer.tokenize(c.text, escape=False, return_str=True)
                                text = tokenizer.tokenize(c.text, return_str=True)
                            elif c.tag == 'Opinions':
                                for op in c:
                                    ops.append(op.attrib)
                        if not ops:
                            continue
                        data[data_type][lang][task][domain][text] = ops
    return data['train'], data['dev'], data['test']

from nltk.translate.nist_score import sentence_nist
from nltk.tokenize.nist import NISTTokenizer

ntok = NISTTokenizer()


def compute_nist(hypothesis, references):
    hypothesis = list(ntok.tokenize(hypothesis))
    references = [list(ntok.tokenize(reference)) for reference in references]
    return sentence_nist(references, hypothesis)

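# Hedged usage sketch for compute_nist above (the sentences are made up);
# sentence_nist scores n-grams up to n=5, so very short segments can give
# degenerate scores.
hyp = 'the quick brown fox jumps over the lazy dog'
refs = ['the fast brown fox jumped over the lazy dog']
print(compute_nist(hyp, refs))
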
def __init__(self):
    self.TK = NISTTokenizer()
    self.punct = re.compile('^[^a-zA-Z0-9_]$')

class SentenceFeatures(BaseEstimator, TransformerMixin):
    """
    Extract sentence features in a format supporting Pipelines.

    Uses the top 10 discriminating features from the Simaki (2018) paper:
    'Evaluating stance-annotated sentences from the Brexit Blog Corpus:
    A quantitative linguistic analysis'

    These are:
    1. Average word length
    2. Conjunction frequency
    3. Sentence length in words
    4. Comma frequency
    5. Full stop frequency
    6. Hapax legomena (number of words appearing in utterance only once)
    7. Number of different words used
    8. Sentence length in characters
    9. Punctuation frequency
    10. Hapax dislegomena (number of words appearing in utterance only twice)
    """

    def __init__(self):
        self.TK = NISTTokenizer()
        self.punct = re.compile('^[^a-zA-Z0-9_]$')

    def fit(self, *_):
        return self

    def transform(self, X, *_):
        result = []
        for sent in X:
            features = defaultdict(int)
            num_words = len(sent.split())
            tokens = self.TK.tokenize(sent, lowercase=True)
            tags = nltk.pos_tag(tokens)
            features['sent length/words'] = num_words
            counts = Counter()
            for i, token in enumerate(tokens):
                if self.punct.match(token):
                    features['punctuation'] += 1
                    if token == ',':
                        features['comma'] += 1
                    if token == '.':
                        features['period'] += 1
                else:
                    if tags[i][1] == 'CC':
                        features['conjunctions'] += 1
                    num_chars = len(re.sub(r'\W', '', token))
                    features['mean word length'] += num_chars
                    features['sent length/chars'] += num_chars
                    counts.update([token])
            features['mean word length'] /= num_words
            features['hapax legomera'] = sum(1 for k, v in counts.items() if v == 1)
            features['hapax dislegomera'] = sum(1 for k, v in counts.items() if v == 2)
            features['different words'] = len(counts.keys())
            result.append(features)
        return result

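# A minimal sketch (not from the original code) of wiring SentenceFeatures into
# a scikit-learn Pipeline: DictVectorizer turns the per-sentence feature dicts
# into a matrix, and the classifier choice here is purely illustrative.
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

stance_pipeline = Pipeline([
    ('features', SentenceFeatures()),
    ('vectorize', DictVectorizer()),
    ('classify', LogisticRegression()),
])
# stance_pipeline.fit(sentences, labels) would then train end-to-end on labelled data
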
seq2seq = t.train(
    seq2seq,
    train,
    num_epochs=6,
    optimizer=optimizer,
    teacher_forcing_ratio=0.6,
    teacher_forcing_half_life=5000,
    resume=opt.resume,
)
predictor = Predictor(seq2seq, input_vocab, output_vocab)
loss, acc = Evaluator(loss=loss).evaluate(
    seq2seq,
    torchtext.data.TabularDataset(path=opt.test_path, format="tsv",
                                  fields=[("src", src), ("tgt", tgt)]),
)
logging.info("Loss: {}, Acc: {}".format(loss, acc))

import nltk
nltk.download('perluniprops')
from nltk.tokenize.nist import NISTTokenizer

nist = NISTTokenizer()
while True:
    seq_str = input("Type in a source sequence:")
    seq = nist.tokenize(seq_str.strip(), lowercase=False)
    print(predictor.predict(seq))

def get_nist_tokenizer():
    return NISTTokenizer()

def main(download_settings_filename, parse_settings_filename):
    with open(download_settings_filename, 'r') as f:
        download_config = json.load(f)
    with open(parse_settings_filename, 'r') as f:
        parse_config = json.load(f)
    topic = download_config.get('topic', 'Medicine')
    data_dir = os.path.join(
        download_config.get('save_dir', os.path.join('data', 'wiki')), topic)
    save_dir = os.path.join(
        parse_config.get('save_dir', os.path.join('artifacts', 'wiki')), topic, 'vocab')
    exclude_vocab = parse_config.get('exclude_vocab', [])
    min_page_vocab = parse_config.get('min_page_vocab', 5)
    plot_top_k = parse_config.get('plot_top_k', 40)
    plot_cumulative = parse_config.get('plot_cumulative', True)
    plot_title = ('top {} frequency'.format(plot_top_k) if not plot_cumulative
                  else 'top {} cumulative'.format(plot_top_k))
    make_plots = plot_top_k > 0
    wiki_url = 'https://en.wikipedia.org/wiki/Category:{}'.format(topic)
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
    word_tokenizer = NISTTokenizer().tokenize
    lem = nltk.WordNetLemmatizer()
    S = requests.Session()
    pages = glob(os.path.join(data_dir, '*.html'))
    total_vocab = FreqDist()
    document_vocabs = {}
    print('reading {} files and generating vocabulary'.format(len(pages)))
    os.makedirs(save_dir, exist_ok=True)
    for page in tqdm(pages):
        page_tokens = process_page(S, page, exclude_vocab, word_tokenizer, lem,
                                   sent_tokenizer)
        # ignore pages with a very small vocabulary
        if len(page_tokens) < min_page_vocab:
            continue
        document_vocabs[page] = FreqDist(page_tokens)
        total_vocab.update(page_tokens)
        save_filename = os.path.join(
            save_dir, os.path.basename(page[:page.rfind('.')]) + '.json')
        with open(save_filename, 'w') as f:
            json.dump(dict(document_vocabs[page]), f, indent=4)
        if make_plots:
            save_filename = save_filename[:save_filename.rfind('.')] + '.pdf'
            save_freq_plot(save_filename, document_vocabs[page], max_num=plot_top_k,
                           cumulative=plot_cumulative, title=plot_title)
    with open(os.path.join(save_dir, 'total_count.json'), 'w') as f:
        json.dump(dict(total_vocab), f, indent=4)
    if make_plots:
        save_filename = os.path.join(save_dir, 'total_count.pdf')
        save_freq_plot(save_filename, total_vocab, max_num=plot_top_k,
                       cumulative=plot_cumulative, title=plot_title)

from wikibags.models import WikiArticle
from wordindex.tasks import populate_from_bag
import requests
from bs4 import BeautifulSoup
import re
from nltk.tokenize.nist import NISTTokenizer
from django.db.utils import IntegrityError

ENDPOINT = "https://en.wikipedia.org/w/api.php?action=parse&{key}={wiki_id}&format=json"
WIKI_PAGE = "https://en.wikipedia.org/wiki/{name}/"
NIST = NISTTokenizer()


def get_article_tokens(data):
    try:
        html = data['parse']['text']['*']
    except KeyError:
        raise ValueError("Invalid wiki json")
    soup = BeautifulSoup(html, 'lxml')
    text_tags = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5'])
    for tag in text_tags:
        text = tag.get_text(separator=' ')
        # strip bracketed markers such as "[1]", "[edit]" or "[citation needed]"
        text = re.sub(r"\[[\ ]{0,}[0-9|edit|citation needed]{1,}[\ ]{0,}\]", "", text)

def nist_tokenize(sentence):
    nist = NISTTokenizer()
    return ' '.join(nist.tokenize(sentence))

def fit(self, documents, wordTokenizer=NISTTokenizer()):
    self.__wordIdf = computeWordIdf(documents, wordTokenizer)