Exemplos de NISTTokenizer em Python, exemplos de nltk.tokenize.nist.NISTTokenizer em Python

Exemplo n.º 1

0

Exibir arquivo

class HapaxLegomera(BaseEstimator, TransformerMixin):
    """Per-sentence counts of words that are rare across the whole input.

    A word is counted as ``hapax_legomera`` when it occurs exactly once in
    the collection passed to ``transform`` and as ``hapax_dislegomera`` when
    it occurs exactly twice. Tokens matched by ``self.punct`` (a single
    non-word character) are ignored.
    """

    def __init__(self):
        self.TK = NISTTokenizer()
        self.punct = re.compile('^[^a-zA-Z0-9_]$')

    def _words(self, sent):
        """Return the lowercased tokens of ``sent`` not matched by ``self.punct``."""
        return [tok for tok in self.TK.tokenize(sent, lowercase=True)
                if not self.punct.match(tok)]

    def compile_counts(self, X, *_):
        """Return a ``Counter`` of word frequencies over every sentence in ``X``."""
        frequencies = Counter()
        for sentence in X:
            frequencies.update(self._words(sentence))
        return frequencies

    def fit(self, X, *_):
        """No-op; frequencies are computed from the data given to ``transform``."""
        return self

    def transform(self, X, *_):
        """Return one ``defaultdict(int)`` of features per sentence in ``X``.

        ``X`` is iterated twice (once to count, once to build features), so
        it must be re-iterable.
        """
        frequencies = self.compile_counts(X)
        rows = []
        for sentence in X:
            row = defaultdict(int)
            for word in self._words(sentence):
                seen = frequencies[word]
                if seen == 1:
                    row['hapax_legomera'] += 1
                elif seen == 2:
                    row['hapax_dislegomera'] += 1
            rows.append(row)
        return rows

Exemplo n.º 2

0

Exibir arquivo

 def transform(self, X, y=None):
     """Return a column vector with the mean non-punctuation token length per row.

     ``X`` is indexed as ``X[i, :][0]``, i.e. presumably a 2-D array whose
     first column holds the text -- TODO confirm against the caller.
     ``y`` is ignored.
     """
     nist = NISTTokenizer()
     mean_lengths = []
     for row in range(X.shape[0]):
         words = nist.tokenize(X[row, :][0], lowercase=True)
         lengths = [len(tok) for tok in words if not _punctuation.match(tok)]
         mean_lengths.append(np.mean(lengths))
     return np.array(mean_lengths).reshape(-1, 1)

Exemplo n.º 3

0

Exibir arquivo

def tokenise(caption, lower = True):
    """Tokenize ``caption`` with a fresh NISTTokenizer.

    When ``lower`` is true the caption is lowercased with ``str.lower``
    before it is handed to the tokenizer.
    """
    tokenizer = NISTTokenizer()
    text = caption.lower() if lower else caption
    return tokenizer.tokenize(text)

Exemplo n.º 4

0

Exibir arquivo

 def transform(self, X, y=None):
     """Return a column vector counting tokens POS-tagged ``'.'`` in each row.

     ``X`` is indexed as ``X[i, :][0]``, i.e. presumably a 2-D array whose
     first column holds the text -- TODO confirm against the caller.
     ``y`` is ignored.
     """
     nist = NISTTokenizer()
     counts = []
     for row in range(X.shape[0]):
         tagged = nltk.pos_tag(nist.tokenize(X[row, :][0], lowercase=True))
         counts.append(sum(1 for pair in tagged if pair[1] == '.'))
     return np.array(counts).reshape(-1, 1)

Exemplo n.º 5

0

Exibir arquivo

 def transform(self, X, y=None):
     """Return a column vector with the number of NIST tokens in each row.

     ``X`` is indexed as ``X[i, :][0]``, i.e. presumably a 2-D array whose
     first column holds the text -- TODO confirm against the caller.
     ``y`` is ignored.
     """
     nist = NISTTokenizer()
     token_counts = [
         len(nist.tokenize(X[row, :][0], lowercase=True))
         for row in range(X.shape[0])
     ]
     return np.array(token_counts).reshape(-1, 1)

Exemplo n.º 6

0

Exibir arquivo

 def transform(self, X, y=None):
     """Return a column vector with the number of distinct non-punctuation tokens per row.

     ``X`` is indexed as ``X[i, :][0]``, i.e. presumably a 2-D array whose
     first column holds the text -- TODO confirm against the caller.
     ``y`` is ignored.
     """
     nist = NISTTokenizer()
     distinct_counts = []
     for row in range(X.shape[0]):
         words = nist.tokenize(X[row, :][0], lowercase=True)
         vocabulary = {tok for tok in words if not _punctuation.match(tok)}
         distinct_counts.append(len(vocabulary))
     return np.array(distinct_counts).reshape(-1, 1)

Exemplo n.º 7

0

Exibir arquivo

 def transform(self, X, y=None):
     """Return a column vector counting words that occur exactly once in each row.

     Counting is per row (not across ``X``) and skips tokens matched by
     ``_punctuation``. ``X`` is indexed as ``X[i, :][0]``, i.e. presumably a
     2-D array whose first column holds the text -- TODO confirm against
     the caller. ``y`` is ignored.
     """
     nist = NISTTokenizer()
     hapax_counts = []
     for row in range(X.shape[0]):
         words = nist.tokenize(X[row, :][0], lowercase=True)
         frequencies = Counter(tok for tok in words if not _punctuation.match(tok))
         hapax_counts.append(sum(1 for seen in frequencies.values() if seen == 1))
     return np.array(hapax_counts).reshape(-1, 1)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: text.py Projeto: facebookresearch/access

def word_tokenize(sentence):
    """Return ``sentence`` as a space-joined string of NIST tokens.

    Placeholder tokens such as ``<PERSON_1>`` are put back together after
    the tokenizer has split them apart.
    """
    nist = NISTTokenizer()
    tokenized = ' '.join(nist.tokenize(sentence))
    # Rejoin special tokens that were tokenized by error: e.g. "<PERSON_1>" -> "< PERSON _ 1 >"
    split_specials = [
        found.group()
        for found in re.finditer(r'< (?:[A-Z]+ _ )+\d+ >', tokenized)
    ]
    for broken in split_specials:
        tokenized = tokenized.replace(broken, ''.join(broken.split()))
    return tokenized

Exemplo n.º 9

0

Exibir arquivo

 def __init__(self,
              char_level=False,
              strip_punctuation=False,
              ngram_range=(1, 1)):
     """Set up the tokenizer, empty vocabulary maps and the punctuation filter.

     ``char_level`` and ``ngram_range`` are accepted but not stored or used
     by this initializer.
     """
     self.TK = NISTTokenizer()
     # Token -> index and index -> token lookups, both empty at construction.
     self.word_index = {}
     self.index_word = {}
     self.strip_punctuation = strip_punctuation
     # Matches a token consisting of exactly one non-word character.
     self.punct = re.compile('^[^a-zA-Z0-9_]$')

Exemplo n.º 10

0

Exibir arquivo

Arquivo: dwgen.py Projeto: pgoodall/dwgen

def build_word_list():
    """Read text from standard input and return its filtered word tokens.

    The text is NIST-tokenized in lowercase; a token is kept when it is not
    an English stop word, is purely alphabetic, and has between 4 and 8
    characters inclusive.

    Returns:
        list[str]: the surviving tokens, in input order (duplicates kept).
    """
    nist = NISTTokenizer()
    text = sys.stdin.read()
    stop_words = set(stopwords.words('english'))
    return [
        word
        for word in nist.tokenize(text, lowercase=True)
        if word not in stop_words
        and word.isalpha()
        # The original `len(word) > 3 & len(word) < 9` parsed as the chained
        # comparison `len(word) > (3 & len(word)) < 9` because `&` binds
        # tighter than `<`/`>`, so the upper bound was never enforced.
        and 3 < len(word) < 9
    ]

Exemplo n.º 11

0

Exibir arquivo

Arquivo: prepare_mftc_dataset.py Projeto: willferreira/multilabel-stance-detection

def __old__get_train_test_split(corpus,
                                annotations,
                                n_splits=5,
                                train_test_split=0.8,
                                cutoff=3):
    """Older variant of the train/test split and ELMo embedding export.

    Mutates ``annotations`` in place: drops rows whose row sum over the
    non-``Tweet`` columns exceeds ``cutoff``, adds a ``set`` column and one
    ``fold_<k>`` column per K-fold split, and replaces ``Tweet`` with its
    lowercased, space-joined NIST tokens. Writes
    ``moral-dataset-<corpus>.csv`` and
    ``moral-dataset-<corpus>_elmo_embeddings.csv`` to the working directory.

    Args:
        corpus: value interpolated into the output file names.
        annotations: DataFrame with a ``Tweet`` column plus label columns
            (presumably 0/1 indicators -- TODO confirm).
        n_splits: number of K-fold splits over the training rows.
        train_test_split: fraction of each group sampled into the train set.
        cutoff: maximum row sum kept, and the upper bound of the grouping loop.

    Returns:
        None.
    """
    annotations_no_tweet = annotations.drop(labels='Tweet', axis=1)
    to_drop = annotations_no_tweet[annotations_no_tweet.sum(
        axis=1) > cutoff].index
    annotations.drop(labels=to_drop, axis=0, inplace=True)

    train_idx = set()
    test_idx = set()

    # Sample train/test separately for each row-sum value 1..cutoff.
    # NOTE(review): the sum below runs over `annotations`, which still holds
    # the string `Tweet` column -- confirm this behaves as intended on the
    # pandas version in use.
    for i in range(1, cutoff + 1, 1):
        an = annotations[annotations.sum(axis=1) == i]
        train_sample = an.sample(frac=train_test_split)
        train_idx.update(train_sample.index)
        test_idx.update(set(an.index).difference(train_sample.index))

    # NOTE(review): `.at` is documented for scalar access and these keys are
    # sets; recent pandas versions reject set indexers -- verify.
    annotations.at[train_idx, 'set'] = 'train'
    annotations.at[test_idx, 'set'] = 'test'

    kf = KFold(n_splits=n_splits)
    train_data = annotations[annotations.set == 'train']
    # NOTE(review): kf.split yields positions within `train_data`, but they
    # are applied with `.iloc` to the full `annotations` frame; the two only
    # line up if every train row precedes every test row.
    for i, (train_idx, test_idx) in enumerate(kf.split(train_data)):
        fold_id = 'fold_{}'.format(i + 1)
        annotations[fold_id] = None
        col_id = annotations.columns.get_loc(fold_id)
        annotations.iloc[train_idx, col_id] = 'train'
        annotations.iloc[test_idx, col_id] = 'test'

    # Normalize the tweets: lowercase NIST tokens joined by single spaces.
    tokenizer = NISTTokenizer()
    annotations.Tweet = annotations.Tweet.apply(
        lambda x: ' '.join(tokenizer.tokenize(x, lowercase=True)))

    annotations.to_csv('moral-dataset-{}.csv'.format(corpus))

    elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=False)

    # Embed the tweets in batches of 100 rows; a new session is opened and
    # variables are re-initialized for every batch.
    embeds = []
    for i in range(0, annotations.shape[0] // 100 + 1):
        print('Computing embeddings for [{} .. {})'.format(
            i * 100, (i + 1) * 100))
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            tweets = annotations[['Tweet']].iloc[(i * 100):(i + 1) * 100, :]
            # The final batch is empty when the row count is a multiple of 100.
            if tweets.shape[0] > 0:
                elmo_tweet_embeddings = session.run(
                    elmo(tf.squeeze(tf.cast(tweets.values, tf.string)),
                         signature='default',
                         as_dict=True)['default'])
                embeds.append(
                    pd.DataFrame(index=tweets.index,
                                 data=elmo_tweet_embeddings))
    # NOTE(review): `axis` is passed positionally; newer pandas releases
    # require `axis=0` as a keyword -- verify.
    all_embeds = pd.concat(embeds, 0)
    all_embeds.to_csv('moral-dataset-{}_elmo_embeddings.csv'.format(corpus))

Exemplo n.º 12

0

Exibir arquivo

def train(factrueval2016_devset_dir: str, split_by_paragraphs: bool,
          elmo_will_be_tuned: bool, max_epochs: int, batch_size: int,
          lr: float, gpu_memory_frac: float, model_name: str) -> ELMo_NER:
    """Load a pickled ``ELMo_NER`` from ``model_name`` or train and save one.

    When ``model_name`` exists it is unpickled and returned. Otherwise the
    FactRuEval-2016 devset is converted to a temporary JSON file, loaded,
    and used to fit a new recognizer. Its ``max_seq_length`` is the smallest
    power of two (at least 2) that is not below the longest sample's token
    count. The fitted recognizer is pickled to ``model_name``.

    Returns:
        The loaded or newly fitted recognizer.
    """
    if os.path.isfile(model_name):
        # NOTE: unpickling executes code stored in the file; only point this
        # at model files you trust.
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, ELMo_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.
              format(model_name))
        print('')
        return recognizer

    # Only the *name* of the temporary file is kept; the conversion helper
    # is expected to (re)create the file at that path.
    temp_json_name = tempfile.NamedTemporaryFile(mode='w').name
    try:
        factrueval2016_to_json(factrueval2016_devset_dir, temp_json_name,
                               split_by_paragraphs)
        X, y = load_dataset(temp_json_name)
    finally:
        if os.path.isfile(temp_json_name):
            os.remove(temp_json_name)
    print('Data for training have been loaded...')
    print('Number of samples is {0}.'.format(len(y)))
    print('')

    # Longest sample, measured in NIST "international" tokens (0 if X is empty).
    tokenizer = NISTTokenizer()
    max_number_of_tokens = max(
        (len(tokenizer.international_tokenize(sample)) for sample in X),
        default=0)
    del tokenizer
    print('Maximal number of tokens is {0}.'.format(max_number_of_tokens))

    # Round the sequence length up to a power of two, starting from 2.
    seq_length = 2
    while seq_length < max_number_of_tokens:
        seq_length *= 2

    elmo_hub_module_handle = 'http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz'
    recognizer = ELMo_NER(finetune_elmo=elmo_will_be_tuned,
                          batch_size=batch_size,
                          l2_reg=1e-3,
                          max_seq_length=seq_length,
                          elmo_hub_module_handle=elmo_hub_module_handle,
                          validation_fraction=0.25,
                          max_epochs=max_epochs,
                          patience=5,
                          gpu_memory_frac=gpu_memory_frac,
                          verbose=True,
                          random_seed=42,
                          lr=lr)
    recognizer.fit(X, y)
    with open(model_name, 'wb') as fp:
        pickle.dump(recognizer, fp)
    print('')
    print(
        'The NER has been successfully fitted and saved into the file `{0}`...'
        .format(model_name))
    print('')
    return recognizer

Exemplo n.º 13

0

Exibir arquivo

class WordTokenizer2(BaseEstimator, TransformerMixin):
    """Map sentences to lists of integer word indices.

    ``fit`` assigns indices starting at 1 in order of first appearance;
    ``transform`` encodes sentences with that vocabulary and silently drops
    tokens it has not seen. With ``strip_punctuation`` set, tokens matched
    by ``self.punct`` are ignored in both steps.
    """

    def __init__(self,
                 char_level=False,
                 strip_punctuation=False,
                 ngram_range=(1, 1)):
        # char_level and ngram_range are accepted but not used here.
        self.TK = NISTTokenizer()
        self.word_index = dict()
        self.index_word = dict()
        self.strip_punctuation = strip_punctuation
        self.punct = re.compile('^[^a-zA-Z0-9_]$')

    def fit(self, X, *_):
        """Extend the vocabulary with every new token found in ``X``."""
        next_index = 1
        for sentence in X:
            for tok in self.TK.tokenize(sentence, lowercase=True):
                if self.strip_punctuation and self.punct.match(tok):
                    continue
                if tok in self.word_index:
                    continue
                self.word_index[tok] = next_index
                self.index_word[next_index] = tok
                next_index += 1
        return self

    def transform(self, X, *_):
        """Return, for each sentence, a list of indices such as ``[1, 2, 3, 4]``."""
        encoded = []
        for sentence in X:
            encoded.append([
                self.word_index[tok]
                for tok in self.TK.tokenize(sentence, lowercase=True)
                if not (self.strip_punctuation and self.punct.match(tok))
                and tok in self.word_index
            ])
        return encoded

Exemplo n.º 14

0

Exibir arquivo

Arquivo: prepare_mftc_dataset.py Projeto: willferreira/multilabel-stance-detection

def get_train_test_split(corpus,
                         annotations,
                         n_splits=5,
                         train_test_split=0.8,
                         cutoff=3):
    """Split annotated tweets into train/test sets and folds, then export ELMo embeddings.

    Rows are grouped by the string formed from their non-``Tweet`` column
    values, and the test set is sampled within each group. ``annotations``
    is mutated in place: it gains a ``set`` column and one ``fold_<k>``
    column per fold, and ``Tweet`` is replaced by its lowercased,
    space-joined NIST tokens. Each fold's test rows are sampled
    independently from the train set, so folds may overlap. Writes
    ``moral-dataset-<corpus>.csv`` and
    ``moral-dataset-<corpus>_elmo_embeddings.csv`` to the working directory.

    Args:
        corpus: value interpolated into the output file names.
        annotations: DataFrame with a ``Tweet`` column plus label columns.
        n_splits: number of fold columns to create.
        train_test_split: fraction of each group kept for training.
        cutoff: unused; kept for signature compatibility.

    Returns:
        None.
    """
    annotations_no_tweet = annotations.drop(labels='Tweet', axis=1)
    grps = annotations_no_tweet.apply(lambda v: ''.join(map(str, v)),
                                      axis=1).to_frame(0).groupby(0)[0]
    test_idx = grps.apply(lambda g: g.sample(frac=1 - train_test_split)
                          ).index.get_level_values(1)
    # Materialize as a list: pandas no longer accepts set objects as indexers.
    train_idx = list(set(annotations_no_tweet.index).difference(test_idx))

    # `.loc` is the label-based accessor for multi-row assignment
    # (`.at` is meant for single scalar cells).
    annotations.loc[train_idx, 'set'] = 'train'
    annotations.loc[test_idx, 'set'] = 'test'

    train_grps = annotations_no_tweet.loc[train_idx, :].apply(lambda v: ''.join(map(str, v)), axis=1) \
        .to_frame(0).groupby(0)[0]
    for i in range(n_splits):
        fold_test_idx = train_grps.apply(
            lambda g: g.sample(frac=1 / n_splits)).index.get_level_values(1)
        fold_train_idx = list(set(train_idx).difference(fold_test_idx))
        fold_id = 'fold_{}'.format(i + 1)
        annotations[fold_id] = None
        annotations.loc[fold_train_idx, fold_id] = 'train'
        annotations.loc[fold_test_idx, fold_id] = 'test'

    # Normalize the tweets: lowercase NIST tokens joined by single spaces.
    tokenizer = NISTTokenizer()
    annotations.Tweet = annotations.Tweet.apply(
        lambda x: ' '.join(tokenizer.tokenize(x, lowercase=True)))

    annotations.to_csv('moral-dataset-{}.csv'.format(corpus))

    elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=False)

    # Embed the tweets in batches of 100 rows; a new session is opened and
    # variables are re-initialized for every batch.
    embeds = []
    for i in range(0, annotations.shape[0] // 100 + 1):
        print('Computing embeddings for [{} .. {})'.format(
            i * 100, (i + 1) * 100))
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            tweets = annotations[['Tweet']].iloc[(i * 100):(i + 1) * 100, :]
            # The final batch is empty when the row count is a multiple of 100.
            if tweets.shape[0] > 0:
                elmo_tweet_embeddings = session.run(
                    elmo(tf.squeeze(tf.cast(tweets.values, tf.string)),
                         signature='default',
                         as_dict=True)['default'])
                embeds.append(
                    pd.DataFrame(index=tweets.index,
                                 data=elmo_tweet_embeddings))
    # `axis` must be passed by keyword on current pandas.
    all_embeds = pd.concat(embeds, axis=0)
    all_embeds.to_csv('moral-dataset-{}_elmo_embeddings.csv'.format(corpus))

Exemplo n.º 15

0

Exibir arquivo

class NltkNistTokenizer(Tokenizer):
    """``Tokenizer`` implementation that delegates to a ``NISTTokenizer``."""

    def __init__(self) -> None:
        super().__init__()
        # One tokenizer instance is created up front and reused for every call.
        self._base_tokenizer = NISTTokenizer()

    def tokenize_text(self, text: str) -> List[str]:
        """Return the tokens of ``text`` using the tokenizer's default settings."""
        tokens = self._base_tokenizer.tokenize(text)
        return tokens

Exemplo n.º 16

0

Exibir arquivo

Arquivo: text.py Projeto: ibrahim85/text-simplification-evaluation

def get_nist_tokenizer():
    """Return a new ``NISTTokenizer``, downloading its NLTK data if missing.

    If importing the tokenizer raises ``LookupError`` (the ``perluniprops``
    resource is not installed), the resource is downloaded and the import
    is retried once.
    """
    # Inline lazy import because importing nltk is slow
    try:
        from nltk.tokenize.nist import NISTTokenizer
    except LookupError:
        import nltk
        nltk.download('perluniprops')
        # Retry: without this the name stays unbound after a failed import
        # and the return below raises UnboundLocalError.
        from nltk.tokenize.nist import NISTTokenizer
    return NISTTokenizer()

Exemplo n.º 17

0

Exibir arquivo

Arquivo: centroid.py Projeto: rrajasek95/lexrank-demo

    def score(self,
              document,
              sentenceTokenizer=punkt_tokenizer,
              wordTokenizer=NISTTokenizer()):
        """Score each sentence of ``document`` against the document's centroid.

        The model must have been fitted first (``self.__wordIdf`` set).
        Note that both default tokenizers are created once, when the method
        is defined, and shared between calls.

        Returns:
            Whatever ``__scoreSentencesAgainstCentroid`` produces for the
            centroid, the document's sentences and ``wordTokenizer``.
        """
        assert self.__wordIdf is not None, "Cannot score the model before fitting"
        tfidf_by_word = self.__computeTfIdf(document, wordTokenizer)
        centroid = self.__computeCentroidSentence(tfidf_by_word)
        return self.__scoreSentencesAgainstCentroid(
            centroid, sentenceTokenizer.tokenize(document), wordTokenizer)

Exemplo n.º 18

0

Exibir arquivo

def computeWordIdf(documents, wordTokenizer=NISTTokenizer()):
    """Compute an inverse-document-frequency table for ``documents``.

    Every document is tokenized in lowercase and each distinct token is
    counted once per document. The result maps a token to
    ``log(number_of_documents / documents_containing_token)`` and yields 0
    for tokens that were never seen (it is a ``defaultdict(int)``).
    """
    document_frequency = Counter()
    n_documents = 0
    for document in documents:
        distinct_tokens = set(wordTokenizer.tokenize(document, lowercase=True))
        document_frequency.update(distinct_tokens)
        n_documents += 1
    return defaultdict(
        int,
        {term: log(n_documents / seen_in)
         for term, seen_in in document_frequency.items()})

Exemplo n.º 19

0

Exibir arquivo

Arquivo: vectorizers.py Projeto: willferreira/Brexit-Corpus-Stance-Classification-Project

class WordIndexer(BaseEstimator, TransformerMixin):
    """
    code modified from https://github.com/adventuresinML/adventures-in-ml-code/blob/master/keras_lstm.py

    Encodes sentences as fixed-width rows of integer word indices.
    ``fit`` builds a vocabulary with indices starting at 1, rarest words
    first. ``transform`` returns a zero-padded array that is
    ``sent_size + 25`` columns wide, where ``sent_size`` is the longest
    sentence seen during ``fit``.
    """

    def __init__(self, reverse=False):
        # `reverse` is accepted but not used by this class.
        self.TK = NISTTokenizer()
        self.word2idx = None
        self.sent_size = 0

    def build_vocab(self, X, *_):
        """Return ``(word2idx, max_len)`` for the sentences in ``X``.

        ``word2idx`` numbers the tokens from 1 in ascending order of corpus
        frequency; ``max_len`` is the largest token count of any sentence.
        An empty ``X`` yields ``({}, 0)`` instead of raising.
        """
        counter = Counter()
        max_len = 0
        for sent in X:
            tokens = self.TK.tokenize(sent, lowercase=True)
            max_len = max(max_len, len(tokens))
            counter.update(tokens)

        # sorted() is stable, so equally frequent words keep first-seen order.
        ranked = sorted(counter.items(), key=lambda item: item[1])
        word2idx = {word: idx for idx, (word, _) in enumerate(ranked, start=1)}
        return word2idx, max_len

    def fit(self, X, *_):
        """Learn the vocabulary and maximum sentence length from ``X``."""
        self.word2idx, self.sent_size = self.build_vocab(X)
        return self

    def transform(self, X, *_):
        """Return a ``(len(X), sent_size + 25)`` array of word indices.

        Unseen tokens are encoded as 0 (the padding value), and tokens
        beyond the array width are dropped.
        """
        width = self.sent_size + 25
        vec = np.zeros((len(X), width))
        for i, sent in enumerate(X):
            tokens = self.TK.tokenize(sent, lowercase=True)
            for j, tok in enumerate(tokens[:width]):
                vec[i][j] = self.word2idx.get(tok, 0)
        # Return only after every sentence is encoded; the original returned
        # from inside the loop, so only the first row was ever filled.
        return vec

Exemplo n.º 20

0

Exibir arquivo

def process_tweets(pair_id, target_pair, n_splits):
    """Label, tokenize and embed the tweets of one target pair.

    ``target_pair`` is mutated in place. It gains a ``set`` column
    (``'train'`` where ``Test/Train/Dev`` is ``Train`` or ``Dev``,
    ``'test'`` otherwise) and one ``fold_<k>`` column per K-fold split of
    the train rows. ``Tweet`` is replaced by its lowercased, space-joined
    NIST tokens, and the ``Stance 1``/``Stance 2`` columns are renamed to
    ``Target 1``/``Target 2``. Writes ``tweets-<pair_id>.csv`` and
    ``tweets-<pair_id>_elmo_embeddings.csv`` to the working directory.

    Args:
        pair_id: identifier used in log messages and output file names.
        target_pair: DataFrame with ``Tweet``, ``Test/Train/Dev`` and stance columns.
        n_splits: number of K-fold splits over the train rows.

    Returns:
        None.
    """
    is_train = target_pair['Test/Train/Dev'].isin(['Train', 'Dev'])
    target_pair['set'] = None
    # Assign through .loc on the frame itself; chained assignment
    # (frame[col][mask] = ...) may write to a temporary copy.
    target_pair.loc[is_train, 'set'] = 'train'
    target_pair.loc[~is_train, 'set'] = 'test'

    kf = KFold(n_splits=n_splits)
    target_pair_train = target_pair[is_train]
    # KFold yields positions *within the train subset*; translate them to
    # positions in the full frame before writing the fold labels.
    train_positions = is_train.values.nonzero()[0]
    for i, (train_idx, test_idx) in enumerate(kf.split(target_pair_train)):
        fold_id = 'fold_{}'.format(i+1)
        target_pair[fold_id] = None
        fold_col = target_pair.columns.get_loc(fold_id)
        target_pair.iloc[train_positions[train_idx], fold_col] = 'train'
        target_pair.iloc[train_positions[test_idx], fold_col] = 'test'

    # Normalize the tweets: lowercase NIST tokens joined by single spaces.
    tokenizer = NISTTokenizer()
    target_pair.Tweet = target_pair.Tweet.apply(
        lambda x: ' '.join(tokenizer.tokenize(x, lowercase=True)))

    target_pair.rename(columns={'Stance 1': 'Target 1',
                                'Stance 2': 'Target 2'}, inplace=True)
    target_pair.to_csv('tweets-{}.csv'.format(pair_id))

    elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=False)

    embeds = []
    # The format string previously lacked a placeholder for pair_id.
    print('There are {} tweets in pair: {}'.format(target_pair.shape[0], pair_id))
    # Embed in batches of 100 rows; a new session is opened and variables
    # are re-initialized for every batch.
    for i in range(0, target_pair.shape[0] // 100 + 1):
        print('Computing embeddings for [{} .. {})'.format(i*100, (i+1)*100))
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            tweets = target_pair[['Tweet']].iloc[(i*100):(i+1)*100, :]
            # The final batch is empty when the row count is a multiple of 100.
            if tweets.shape[0] > 0:
                elmo_tweet_embeddings = session.run(elmo(tf.squeeze(tf.cast(tweets.values, tf.string)),
                                                         signature='default', as_dict=True)['default'])
                embeds.append(pd.DataFrame(index=tweets.index, data=elmo_tweet_embeddings))
    # `axis` must be passed by keyword on current pandas.
    all_embeds = pd.concat(embeds, axis=0)
    print('There are {} embeddings in pair: {}'.format(all_embeds.shape[0], pair_id))
    all_embeds.to_csv('tweets-{}_elmo_embeddings.csv'.format(pair_id))

Exemplo n.º 21

0

Exibir arquivo

Arquivo: semeval16t5_loader.py Projeto: zhu-y11/evaluation

def loadData(dir_path):
    """Walk ``dir_path`` and collect opinion annotations from every ``.xml`` file.

    The language comes from the third ``_``-separated field of the file
    name. The data type, domain and task are the names of the file's
    directory, its parent and its grandparent. Sentences without opinions
    are skipped.

    Returns:
        The ``'train'``, ``'dev'`` and ``'test'`` entries of the collected
        mapping, each keyed by language, task, domain and tokenized text.
    """
    data = AutoVivification()
    for root, dirs, files in os.walk(dir_path):
        for file_name in files:
            if not file_name.endswith('.xml'):
                continue
            stem = file_name[:file_name.find('.xml')]
            # language of the data
            lang = lang_map[stem.split('_')[2].lower()]
            # data type (train/dev/test), domain (restaurant, laptop, ...)
            # and subtask, read off the directory hierarchy
            domain_dir = os.path.dirname(root)
            data_type = os.path.basename(root)
            domain = os.path.basename(domain_dir)
            task = os.path.basename(os.path.dirname(domain_dir))

            tokenizer = NISTTokenizer()

            reviews = ET.parse(os.path.join(root, file_name)).getroot()
            for review in reviews:
                for sentences in review:
                    for sentence in sentences:
                        text = None
                        opinions = []
                        for child in sentence:
                            if child.tag == 'text':
                                text = tokenizer.tokenize(child.text,
                                                          return_str=True)
                            elif child.tag == 'Opinions':
                                opinions.extend(op.attrib for op in child)
                        if opinions:
                            data[data_type][lang][task][domain][text] = opinions
    return data['train'], data['dev'], data['test']

Exemplo n.º 22

0

Exibir arquivo

from nltk.translate.nist_score import sentence_nist
from nltk.tokenize.nist import NISTTokenizer
# Module-level tokenizer, created once at import time and shared by compute_nist.
ntok = NISTTokenizer()


def compute_nist(hypothesis, references):
    """Return the sentence-level NIST score of ``hypothesis`` against ``references``.

    ``hypothesis`` is a string and ``references`` an iterable of strings;
    all of them are tokenized with the module-level ``ntok`` before scoring.
    """
    hypothesis_tokens = list(ntok.tokenize(hypothesis))
    reference_tokens = []
    for reference in references:
        reference_tokens.append(list(ntok.tokenize(reference)))
    return sentence_nist(reference_tokens, hypothesis_tokens)

Exemplo n.º 23

0

Exibir arquivo

 def __init__(self):
     """Create the tokenizer and the punctuation pattern used by this object."""
     self.TK = NISTTokenizer()
     # Matches a token consisting of exactly one character that is not a
     # letter, digit or underscore.
     self.punct = re.compile('^[^a-zA-Z0-9_]$')

Exemplo n.º 24

0

Exibir arquivo

class SentenceFeatures(BaseEstimator, TransformerMixin):
    """
    Extract sentence features in format supporting Pipelines.

    Uses the top 10 discriminating features from the Simaki (2018) paper:
    'Evaluating stance-annotated sentences from the Brexit
    Blog Corpus: A quantitative linguistic analysis'

    These are:
    1. Average word length
    2. Conjunction frequency
    3. Sentence length in words
    4. Comma frequency
    5. Full stop frequency
    6. Hapax Legomena (number of words appearing in utterance only once)
    7. Number of different words used
    8. Sentence length in characters
    9. Punctuation frequency
    10. Hapax dislegomena (number of words appearing in utterance only twice)
    """
    def __init__(self):
        self.TK = NISTTokenizer()
        # Matches a token consisting of exactly one non-word character.
        self.punct = re.compile('^[^a-zA-Z0-9_]$')

    def fit(self, *_):
        """No-op; the transformer is stateless."""
        return self

    def transform(self, X, *_):
        """Return one ``defaultdict(int)`` of features per sentence in ``X``.

        'sent length/words' counts whitespace-separated words. The other
        features are computed from the lowercased NIST tokens. A sentence
        with no words gets a 'mean word length' of 0 instead of raising
        ``ZeroDivisionError``.
        """
        result = []
        for sent in X:
            features = defaultdict(int)
            num_words = len(sent.split())
            tokens = self.TK.tokenize(sent, lowercase=True)
            tags = nltk.pos_tag(tokens)
            features['sent length/words'] = num_words
            counts = Counter()
            for token, tagged in zip(tokens, tags):
                if self.punct.match(token):
                    features['punctuation'] += 1
                    if token == ',':
                        features['comma'] += 1
                    if token == '.':
                        features['period'] += 1
                else:
                    if tagged[1] == 'CC':
                        features['conjunctions'] += 1

                    # Count only word characters of the token.
                    num_chars = len(re.sub(r'\W', '', token))
                    features['mean word length'] += num_chars
                    features['sent length/chars'] += num_chars
                    counts[token] += 1

            # The character total accumulated above becomes a mean here;
            # guard against empty / whitespace-only sentences.
            if num_words:
                features['mean word length'] /= num_words
            else:
                features['mean word length'] = 0
            features['hapax legomera'] = sum(
                1 for seen in counts.values() if seen == 1)
            features['hapax dislegomera'] = sum(
                1 for seen in counts.values() if seen == 2)
            features['different words'] = len(counts)
            result.append(features)
        return result

Exemplo n.º 25

0

Exibir arquivo

Arquivo: train.py Projeto: li-kai/CS4246-AI-Planning-and-Decision-Making

    # Train the model on `train` for 6 epochs via `t.train`, passing
    # `opt.resume` through as the resume flag; the result is rebound to
    # `seq2seq`. (The enclosing block is not part of this excerpt.)
    seq2seq = t.train(
        seq2seq,
        train,
        num_epochs=6,
        optimizer=optimizer,
        teacher_forcing_ratio=0.6,
        teacher_forcing_half_life=5000,
        resume=opt.resume,
    )

# Wrap the model for inference, then evaluate it on the TSV file at
# opt.test_path. Note that `loss` is read as the Evaluator's loss argument
# and then rebound to the evaluation result on the same statement.
predictor = Predictor(seq2seq, input_vocab, output_vocab)
loss, acc = Evaluator(loss=loss).evaluate(
    seq2seq,
    torchtext.data.TabularDataset(path=opt.test_path,
                                  format="tsv",
                                  fields=[("src", src), ("tgt", tgt)]),
)
logging.info("Loss: {}, Acc: {}".format(loss, acc))

# Fetch the 'perluniprops' NLTK resource before importing the NIST tokenizer.
import nltk
nltk.download('perluniprops')

from nltk.tokenize.nist import NISTTokenizer
nist = NISTTokenizer()

# Interactive loop: read a line, tokenize it (case preserved) and print the
# prediction. There is no exit condition; it runs until interrupted or
# input() raises.
while True:
    seq_str = input("Type in a source sequence:")
    seq = nist.tokenize(seq_str.strip(), lowercase=False)
    print(predictor.predict(seq))

Exemplo n.º 26

0

Exibir arquivo

def get_nist_tokenizer():
    """Create and return a new ``NISTTokenizer`` instance."""
    tokenizer = NISTTokenizer()
    return tokenizer

Exemplo n.º 27

0

Exibir arquivo

def main(download_settings_filename, parse_settings_filename):
    """Build per-page and total vocabulary counts from downloaded wiki pages.

    Reads two JSON settings files, processes every ``*.html`` file in the
    topic's data directory with ``process_page``, and writes one JSON
    frequency file per page plus ``total_count.json`` into the topic's
    ``vocab`` directory. When ``plot_top_k`` is positive, a PDF frequency
    plot is saved next to each JSON file.

    Args:
        download_settings_filename: path to the JSON file holding the
            download settings (``topic``, ``save_dir``).
        parse_settings_filename: path to the JSON file holding the parse
            settings (``save_dir``, ``exclude_vocab``, ``min_page_vocab``,
            ``plot_top_k``, ``plot_cumulative``).
    """
    with open(download_settings_filename, 'r') as f:
        download_config = json.load(f)
    with open(parse_settings_filename, 'r') as f:
        parse_config = json.load(f)
    topic = download_config.get('topic', 'Medicine')
    # Input pages live under <download save_dir>/<topic>; results are
    # written under <parse save_dir>/<topic>/vocab.
    data_dir = os.path.join(
        download_config.get('save_dir', os.path.join('data', 'wiki')), topic)
    save_dir = os.path.join(
        parse_config.get('save_dir', os.path.join('artifacts', 'wiki')), topic,
        'vocab')
    exclude_vocab = parse_config.get('exclude_vocab', [])
    min_page_vocab = parse_config.get('min_page_vocab', 5)
    plot_top_k = parse_config.get('plot_top_k', 40)
    plot_cumulative = parse_config.get('plot_cumulative', True)
    plot_title = 'top {} frequency'.format(
        plot_top_k) if not plot_cumulative else 'top {} cumulative'.format(
            plot_top_k)
    # Plotting is disabled by setting plot_top_k to 0 or a negative value.
    make_plots = plot_top_k > 0

    # NOTE(review): wiki_url is assigned but never used in this function.
    wiki_url = 'https://en.wikipedia.org/wiki/Category:{}'.format(topic)

    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
    word_tokenizer = NISTTokenizer().tokenize
    lem = nltk.WordNetLemmatizer()

    # NOTE(review): the session is handed to process_page and never closed here.
    S = requests.Session()

    pages = glob(os.path.join(data_dir, '*.html'))

    total_vocab = FreqDist()
    document_vocabs = {}
    print('reading {} files and generating vocabulary'.format(len(pages)))
    os.makedirs(save_dir, exist_ok=True)
    for page in tqdm(pages):
        l = process_page(S, page, exclude_vocab, word_tokenizer, lem,
                         sent_tokenizer)
        # ignore pages with very small vocabulary
        if len(l) < min_page_vocab:
            continue
        document_vocabs[page] = FreqDist(l)
        total_vocab.update(l)
        # Output name: the page's base name with its last extension
        # replaced by '.json'.
        save_filename = os.path.join(
            save_dir,
            os.path.basename(page[:page.rfind('.')]) + '.json')
        with open(save_filename, 'w') as f:
            json.dump(dict(document_vocabs[page]), f, indent=4)
        if make_plots:
            save_filename = save_filename[:save_filename.rfind('.')] + '.pdf'
            save_freq_plot(save_filename,
                           document_vocabs[page],
                           max_num=plot_top_k,
                           cumulative=plot_cumulative,
                           title=plot_title)
    # Aggregate counts over every page that passed the size filter.
    with open(os.path.join(save_dir, 'total_count.json'), 'w') as f:
        json.dump(dict(total_vocab), f, indent=4)
    if make_plots:
        save_filename = os.path.join(save_dir, 'total_count.pdf')
        save_freq_plot(save_filename,
                       total_vocab,
                       max_num=plot_top_k,
                       cumulative=plot_cumulative,
                       title=plot_title)

Exemplo n.º 28

0

Exibir arquivo

Arquivo: tasks.py Projeto: pengra/data

from wikibags.models import WikiArticle
from wordindex.tasks import populate_from_bag

import requests
from bs4 import BeautifulSoup

import re
from nltk.tokenize.nist import NISTTokenizer

from django.db.utils import IntegrityError

# URL template for the MediaWiki parse API; `{key}` and `{wiki_id}` are
# filled in with str.format-style substitution.
ENDPOINT = "https://en.wikipedia.org/w/api.php?action=parse&{key}={wiki_id}&format=json"
# URL template for a rendered article page; `{name}` is the article name.
WIKI_PAGE = "https://en.wikipedia.org/wiki/{name}/"

# Module-level tokenizer, created once at import time.
NIST = NISTTokenizer()


def get_article_tokens(data):
    """Extract paragraph and heading text from a parsed wiki API response.

    ``data`` must contain the rendered HTML at ``data['parse']['text']['*']``.

    Raises:
        ValueError: if any of those keys is missing.

    NOTE(review): this excerpt ends inside the loop, before the cleaned
    text is used or anything is returned; the rest of the function is
    presumably not shown.
    """
    try:
        html = data['parse']['text']['*']
    except KeyError:
        raise ValueError("Invalid wiki json")

    soup = BeautifulSoup(html, 'lxml')
    # Only paragraphs and h1-h5 headings are considered.
    text_tags = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5'])

    for tag in text_tags:

        text = tag.get_text(separator=' ')
        # Strip bracketed markers from the text.
        # NOTE(review): `[0-9|edit|citation needed]` is a character class,
        # not an alternation, so any bracketed run made only of those
        # characters is removed, not just "[12]", "[edit]" and
        # "[citation needed]" -- confirm this is intended.
        text = re.sub(r"\[[\ ]{0,}[0-9|edit|citation needed]{1,}[\ ]{0,}\]",
                      "", text)

Exemplo n.º 29

0

Exibir arquivo

def nist_tokenize(sentence):
    """Return ``sentence`` as a string of NIST tokens separated by single spaces."""
    tokens = NISTTokenizer().tokenize(sentence)
    return ' '.join(tokens)

Exemplo n.º 30

0

Exibir arquivo

Arquivo: centroid.py Projeto: rrajasek95/lexrank-demo

 def fit(self, documents, wordTokenizer=NISTTokenizer()):
     """Compute the word IDF table for ``documents`` and store it on the instance.

     The default ``wordTokenizer`` is created once, when the method is
     defined, and shared between calls.
     """
     idf_table = computeWordIdf(documents, wordTokenizer)
     self.__wordIdf = idf_table