Example #1
import nltk
from string import punctuation
from wordfreq import top_n_list


def detect_freqwords_method(text):
    words = set()
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if word not in punctuation:
                words.add(word)
    if len(words.intersection(top_n_list('de', 30))) > len(words.intersection(top_n_list('es', 30))):
        return "German"
    else:
        return "Spanish"
Example #2
def eval_analogies(frame):
    filename = get_support_data_filename('google-analogies/questions-words.txt')
    quads = read_google_analogies(filename)
    vocab = [
        standardized_uri('en', word)
        for word in wordfreq.top_n_list('en', 200000)
    ]
    wrap = VectorSpaceWrapper(frame=frame)
    vecs = np.vstack([wrap.get_vector(word) for word in vocab])
    tframe = pd.DataFrame(vecs, index=vocab)
    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        vector = analogy_func(frame, *prompt)
        similar = similar_to_vec(tframe, vector)
        result = None
        for match in similar.index:
            if match not in prompt:
                result = match
                break
        if result == answer:
            correct += 1
        else:
            if result not in seen_mistakes:
                print(
                    "%s : %s :: %s : [%s] (should be %s)"
                    % (quad[0], quad[1], quad[2], result, answer)
                    )
                seen_mistakes.add(result)
        total += 1
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
Example #3
def eval_analogies(frame):
    filename = get_support_data_filename('google-analogies/questions-words.txt')
    quads = read_google_analogies(filename)
    vocab = [
        standardized_uri('en', word)
        for word in wordfreq.top_n_list('en', 200000)
    ]
    wrap = VectorSpaceWrapper(frame=frame)
    vecs = np.vstack([wrap.get_vector(word) for word in vocab])
    tframe = pd.DataFrame(vecs, index=vocab)
    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        vector = analogy_func(frame, *prompt)
        similar = similar_to_vec(tframe, vector)
        result = None
        for match in similar.index:
            if match not in prompt:
                result = match
                break
        if result == answer:
            correct += 1
        else:
            if result not in seen_mistakes:
                print(
                    "%s : %s :: %s : [%s] (should be %s)"
                    % (quad[0], quad[1], quad[2], result, answer)
                    )
                seen_mistakes.add(result)
        total += 1
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
Example #4
    def build_tokens(self):
        """
        Get a set of whitelisted tokens.

        Returns: set
        """
        tokens = top_n_list('en', self['token_depth'], ascii_only=True)

        return set(tokens)
Example #5
from wordfreq import top_n_list


def remove_stop_words_hebrew_extended(text):
    # Build the Hebrew vocabulary once instead of re-querying wordfreq for
    # every word/stop-word pair.
    hebrew_vocab = set(top_n_list('he', 100000))
    newword = []
    for word in text.split(' '):
        # Strip a leading stop-word prefix when the remainder is a known word.
        for sw in hebrew_stopwords_ex:
            if word.startswith(sw) and word[len(sw):] in hebrew_vocab:
                word = word[len(sw):]
                break
        if word not in hebrew_stopwords_ex:
            newword.append(word)
    return " ".join(newword)
Example #6
def export_conceptnet_to_hyperwords(table, matrix_filename, vocab_filename, nrows):
    vecs = []
    labels = []
    english_labels = [
        standardized_uri('en', item)
        for item in wordfreq.top_n_list('en', nrows * 2, 'large')
    ]
    count = 0
    for label in english_labels:
        if label in table.index:
            labels.append(label.split('/')[-1])
            vecs.append(get_vector(table, label))
            count += 1
            if count >= nrows:
                break
    np.save(matrix_filename, np.vstack(vecs))
    save_index_as_labels(labels, vocab_filename)
Example #7
def export_conceptnet_to_hyperwords(table, matrix_filename, vocab_filename, nrows):
    vecs = []
    labels = []
    english_labels = [
        standardized_uri('en', item)
        for item in wordfreq.top_n_list('en', nrows * 2, 'large')
    ]
    count = 0
    for label in english_labels:
        if label in table.index:
            labels.append(label.split('/')[-1])
            vecs.append(get_vector(table, label))
            count += 1
            if count >= nrows:
                break
    np.save(matrix_filename, np.vstack(vecs))
    save_index_as_labels(labels, vocab_filename)
Example #8
def main():
    top_words = top_n_list('en', 50000)
    dic = {}
    words_skipped = []
    counter = 0
    for word in tqdm(top_words):
        # Periodically checkpoint progress to disk.
        if counter % 50 == 0:
            with open('data_chinese.json', 'w') as fp:
                json.dump(dic, fp)
            with open('missing_words_chinese.json', 'w') as wp:
                json.dump(words_skipped, wp)
        try:
            dic[word] = '#' + word_to_color(word)
        except Exception:
            print("skip word '{}'".format(word))
            time.sleep(5)
            words_skipped.append(word)
        counter += 1
Example #9
def make_clozes(sentence_file: str, language: str, nwords: int,
                max_sentences_per_word: int,
                max_translations_per_sentence: int,
                max_characters: int) -> None:
    sentence_map = OrderedDict()
    word_map = {}

    with open(sentence_file) as fh:
        while True:
            line = fh.readline()
            if line == "":
                break
            sentence_l1, sentence_l2 = line.strip().split("\t")
            if sentence_l2 not in sentence_map:
                sentence_map[sentence_l2] = []
            sentence_map[sentence_l2].append(sentence_l1)
            words = sentence_to_words(sentence_l2)
            for word in words:
                if word not in word_map:
                    # We use an OrderedDict as an ordered set, since the
                    # Python standard library lacks the latter.
                    word_map[word] = OrderedDict()
                if sentence_l2 not in word_map[word]:
                    word_map[word][sentence_l2] = None

    # We request the top n*2 words, to be sure of having n words left
    # after filtering out the undesirable ones.
    top_n_words = filter_word_list(wordfreq.top_n_list(language, nwords * 2),
                                   nwords)

    writer = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)

    for word in top_n_words:
        if word in word_map:
            sentences_l2 = list(word_map[word])[:max_sentences_per_word]
            for sentence_l2 in sentences_l2:
                translation_list = \
                    sentence_map[sentence_l2][:max_translations_per_sentence]
                cloze = make_anki_cloze(sentence_l2, word)
                translations = " / ".join(translation_list)
                if len(cloze) <= max_characters and \
                   len(translations) <= max_characters:
                    writer.writerow(
                        ["<p>%s</p><p>%s</p>" % (cloze, translations)])
Example #10
def choose_vocab(quads, vocab_size):
    """
    Google and BATS analogies are not multiple-choice; instead, you're supposed to pick
    the best match out of your vector space's entire vocabulary, excluding the
    three words used in the prompt. The vocabulary size can matter a lot: Set
    it too high and you'll get low-frequency words that the data set wasn't
    looking for as answers. Set it too low and the correct answers won't be
    in the vocabulary.

    Set vocab_size='cheat' to see the results for an unrealistically optimal
    vocabulary (the vocabulary of the set of answer words).
    """
    if vocab_size == 'cheat':
        vocab = [
            standardized_uri('en', word)
            for word in sorted(set([quad[3] for quad in quads]))
        ]
    else:
        vocab = [
            standardized_uri('en', word)
            for word in wordfreq.top_n_list('en', vocab_size)
        ]
    return vocab
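A brief sketch of how choose_vocab might be called in both modes, assuming the same helpers used in the earlier analogy examples (get_support_data_filename and read_google_analogies):

# Sketch only: compare a frequency-based vocabulary with the 'cheat' vocabulary.
quads = read_google_analogies(
    get_support_data_filename('google-analogies/questions-words.txt'))
realistic_vocab = choose_vocab(quads, vocab_size=200000)  # top 200,000 English words
cheat_vocab = choose_vocab(quads, vocab_size='cheat')     # only the answer words
print(len(realistic_vocab), len(cheat_vocab))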
Example #11
"""
Module with some text processing functions used by the wrappers.
"""
import re
from typing import Tuple
from wordfreq import top_n_list

TOKENS_REG = re.compile(r"(?u)\b\w+\b")
STOP_WORDS = set(top_n_list("en", 50,
                            wordlist='best'))  # 50 most common stop words


def locate_short_forms(note: str,
                       short_form_list: list) -> Tuple[list, list, list]:
    """
    Find tokens that appear in the short-form list and record each token, its span and its location.
    A token that is both a short form and a stop word is not counted.
    """
    locations: list = []
    short_forms_intext: list = []
    span: list = []
    for i, token in enumerate(TOKENS_REG.finditer(note)):
        if token.group() in short_form_list and token.group(
        ) not in STOP_WORDS:
            locations.append(i)
            short_forms_intext.append(token.group())
            span.append(token.span())

    return short_forms_intext, span, locations
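For illustration, a hypothetical call (the note text and short-form list are invented; only single-token short forms can match, since TOKENS_REG splits on word characters):

# Hypothetical usage of locate_short_forms with made-up data.
note = "Pt reports SOB on exertion, hx of HTN, denies chest pain."
short_forms = ["SOB", "hx", "HTN", "CAD"]
forms, spans, positions = locate_short_forms(note, short_forms)
# forms     -> ['SOB', 'hx', 'HTN']
# spans     -> character offsets of each match within the note
# positions -> token indices of each match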

Example #12
import os
import bz2
import json

from wordfreq import top_n_list
from difflib import SequenceMatcher
from collections import namedtuple

from quotes.utils import clean_text

Token = namedtuple('Tuple', [
    'token',
    'char1',
    'char2',
])

blacklist = set(top_n_list('en', 200))


class Text:
    @classmethod
    def from_stacks(cls, path: str):
        """
        Read from a Stacks JSON file.
        """

        with bz2.open(path, 'rt') as fh:

            metadata = json.loads(fh.read())

            text = metadata.pop('plain_text')
Example #13
    'ms',  # Malay
    'nb',  # Norwegian
    'fa',  # Persian
    'pl',  # Polish
    'pt',  # Portuguese
    'ro',  # Romanian
    'ru',  # Russian
    'sr',  # Serbian
    'es',  # Spanish
    'sv',  # Swedish
    'tr',  # Turkish
    'uk',  # Ukrainian
]

for language_code in language_codes:
    print('Processing %s' % language_code)

    file_name = 'frequency-lists/%s-freq.txt' % language_code
    top_n = top_n_list(language_code, 1000000000)
    with open(file_name, 'w') as file:
        for word in top_n:
            file.write('%s\n' % word)

    file_name_2000 = 'frequency-lists-2000/%s-freq-2000.txt' % language_code
    top_2000 = top_n_list(language_code, 2000)
    with open(file_name_2000, 'w') as file_2000:
        for word in top_2000:
            file_2000.write('%s\n' % word)
Example #14
    def __iter__(self):
        words = top_n_list(lang='en', n=self._num_examples)

        for w in words:
            image = self.create_image(w)
            yield image, w
Example #15
"""
A quick script to output the top N words (1000 for now) in each language.
You can send the output to a file and diff it to see changes between wordfreq
versions.
"""
import wordfreq


N = 1000

if __name__ == '__main__':
    for lang in sorted(wordfreq.available_languages()):
        for word in wordfreq.top_n_list(lang, N):
            print('{}\t{}'.format(lang, word))
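The docstring suggests diffing dumps from different wordfreq versions. One way to make that comparison directly in Python, assuming two such output files have already been saved (the file names below are made up), is with difflib:

# Hypothetical comparison of two saved dumps of this script's output.
import difflib

with open('top_words_v2.txt') as old, open('top_words_v3.txt') as new:
    diff = difflib.unified_diff(old.readlines(), new.readlines(),
                                fromfile='top_words_v2.txt',
                                tofile='top_words_v3.txt')
    print(''.join(diff))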
Example #16
 def __init__(self, nWords):
     self.__nWords = nWords
     self.__words = top_n_list('en', self.__nWords, wordlist='large')
     #		sys.stdout.buffer.write(str(self.__words).encode('utf-8'))
     self.__frequencies = numpy.array(
         [word_frequency(w, 'en') for w in self.__words])
Example #17
File: data.py  Project: HTY2003/KiBoard
import json
from wordfreq import zipf_frequency, top_n_list

jsondict = {
    word: zipf_frequency(word, "en")
    for word in top_n_list('en', 20000)
}

with open('english_zipf.json', 'w') as outfile:
    json.dump(jsondict, outfile)
Example #18
"""
A quick script to output the top N words (1000 for now) in each language.
You can send the output to a file and diff it to see changes between wordfreq
versions.
"""
import wordfreq

N = 1000

for lang in sorted(wordfreq.available_languages()):
    for word in wordfreq.top_n_list(lang, N):
        print('{}\t{}'.format(lang, word))

Example #19
import os

import pandas as pd
from wordfreq import top_n_list

#data_path = '/Users/chenfish/Desktop/Thesis/Project/data/mt_pe/dev/'
data_path = '/Users/yuwen/Desktop/Thesis/Project/data/ht_pe/all_no_split/mtht/'
print(data_path)
print('We are working on 5000 word rank.')

for i in os.listdir(data_path):

    if i[-2:] == 'en':
        data = pd.read_pickle(data_path + i)
        print('Now we are working on', i)
        top_rank = top_n_list('en', 5000)

    elif i[-2:] == 'de':
        data = pd.read_pickle(data_path + i)
        print('Now we are working on', i)
        top_rank = top_n_list('de', 5000)

    elif i[-2:] == 'ru':
        data = pd.read_pickle(data_path + i)
        print('Now we are working on', i)
        top_rank = top_n_list('ru', 5000)
Example #20
import json
import numpy as np
import re
from collections import Counter
import pickle
import torch
import torchtext.vocab as vocab
glove = vocab.GloVe(name='840B', dim=300)
from wordfreq import word_frequency, top_n_list
import time
# print(real_word, get_word(real_word))

top_words = top_n_list('en', 200000)


def get_word(word):
    return glove.vectors[glove.stoi[word]].numpy()


def word2vec(pkl_file):
    _file = open(pkl_file, "rb")
    data = pickle.load(_file)
    zero_embed = np.zeros(300)
    for d in data:
        print(d)
        json_file = d.replace('json', 'pkl')
        tmp_file = open(json_file, "rb")
        json_words = pickle.load(tmp_file)
        t1 = time.time()
        embeddings = []
        for real_word in json_words:
Example #21
def eval_google_analogies(vectors,
                          subset='semantic',
                          vocab_size=200000,
                          verbose=False):
    """
    Evaluate the Google Research analogies, released by Mikolov et al. along
    with word2vec.

    These analogies come in two flavors: semantic and syntactic. Numberbatch
    is intended to be a semantic space, so we focus on semantic analogies.

    The syntactic analogies are about whether you can inflect or conjugate a
    particular word. The semantic analogies are about whether you can sort
    words by their gender, and about geographic trivia.

    I (Rob) think this data set is not very representative, but evaluating
    against it is all the rage.

    These analogies are not multiple-choice; instead, you're supposed to pick
    the best match out of your vector space's entire vocabulary, excluding the
    three words used in the prompt. The vocabulary size can matter a lot: Set
    it too high and you'll get low-frequency words that the data set wasn't
    looking for as answers. Set it too low and the correct answers won't be
    in the vocabulary.

    Set vocab_size='cheat' to see the results for an unrealistically optimal
    vocabulary (the vocabulary of the set of answer words).
    """
    filename = get_support_data_filename(
        'google-analogies/{}-words.txt'.format(subset))
    quads = read_google_analogies(filename)
    if vocab_size == 'cheat':
        vocab = [
            standardized_uri('en', word)
            for word in sorted(set([quad[3] for quad in quads]))
        ]
    else:
        vocab = [
            standardized_uri('en', word)
            for word in wordfreq.top_n_list('en', vocab_size)
        ]
    vecs = np.vstack([vectors.get_vector(word) for word in vocab])
    tframe = pd.DataFrame(vecs, index=vocab)
    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        result = best_analogy_3cosmul(vectors, tframe, *prompt)
        if result == answer:
            correct += 1
        else:
            if verbose and result not in seen_mistakes:
                print("%s : %s :: %s : [%s] (should be %s)" %
                      (quad[0], quad[1], quad[2], result, answer))
                seen_mistakes.add(result)
        total += 1
    low, high = proportion_confint(correct, total)
    result = pd.Series([correct / total, low, high],
                       index=['acc', 'low', 'high'])
    if verbose:
        print(result)
    return result
Example #22
import string
import wordfreq

VALID_LETTERS = string.ascii_uppercase + "'"

for word in wordfreq.top_n_list('en', 1000000, 'best'):
    word = word.upper()
    if all(ch in VALID_LETTERS for ch in word):
        freq = int(wordfreq.word_frequency(word, 'en', 'best') * 1e9) - 9
        if freq > 0:
            print("{},{}".format(word, freq))
Example #23
 def get_most_common(lang):
     """
     Return the single most common word in the language.
     """
     return top_n_list(lang, 1)[0]
Example #24
 def get_most_common(lang):
     """
     Return the single most common word in the language.
     """
     return top_n_list(lang, 1)[0]
Example #25
"""
A quick script to output the top N words (500 for now) in each language.
You can send the output to a file and diff it to see changes between wordfreq
versions.
"""
import wordfreq

N = 500

if __name__ == '__main__':
    for lang in sorted(wordfreq.available_languages()):
        for word in wordfreq.top_n_list(lang, N):
            print('{}\t{}'.format(lang, word))
Example #26
def get_top_n(lang, start=0, end=100):
    if language_supported(lang):
        top_n = top_n_list(CODES[lang], end, wordlist='best')
        top_n = top_n[start:end]
        return top_n
Example #27
import string
import wordfreq

VALID_LETTERS = string.ascii_uppercase + "'"

for word in wordfreq.top_n_list('en', 1000000, 'large'):
    word = word.upper()
    if all(ch in VALID_LETTERS for ch in word):
        freq = int(wordfreq.word_frequency(word, 'en', 'large') * 1e9)
        print("{},{}".format(word, freq))
Example #28
from django.core import serializers

from .models import Question, Answer, Score
from .serializers import QuestionSerializer, UserSerializer, AnswerSerializer
from rest_framework.decorators import api_view
from rest_framework.response import Response
from rest_framework.reverse import reverse
from rest_framework import generics, permissions, renderers
import json
from django.db.models import F, Func
from django.db.models import Q
from .tokenIDHelper import *
from wordfreq import top_n_list
from .sentencesVerif import calc_distance

freq_list = top_n_list('en', 10000, wordlist='best')
THRESHOLD = 3.6  # threshold for the GateKeeper classifier - important!


@api_view(['GET'])
def api_root(request, format=None):
    """
    this function defines the root API of the project
    """
    return Response({
        'users':
        reverse('Users-list', request=request, format=format),
        'Questions':
        reverse('Questions-list', request=request, format=format)
    })
Example #29
def get_vocab(language):
    words = list(wordfreq.top_n_list(language, 100000))
    return set(words[100:])
Example #30
import wordfreq
import math
import json

wordlist = [(w, round(math.log(wordfreq.word_frequency(w, 'en')), 4))
            for w in wordfreq.top_n_list("en", 25000)]

with open("wordfreq-en-25000-log.json", "w") as fh:
    json.dump(wordlist, fh, indent=2)
Example #31
# uses wordfreq==2.3.2
import argparse

import wordfreq

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("languages", nargs="+")
    parser.add_argument("--out-pattern", type=str)
    parser.add_argument("--top_n", type=int, default=1_000)

    args = parser.parse_args()

    for language in args.languages:
        with open(args.out_pattern.format(language), "w") as f:
            # wordfreq lists lowercase all words
            for word in wordfreq.top_n_list(language, args.top_n):
                f.write(word + "\n")
                f.write(word.title() + "\n")
Example #32
import click
import re

from wordfreq import top_n_list

from twitter.utils import get_spark

whitelist = set(top_n_list('en', 10000))


def tokenize_tweet(text):
    """Tokenize tweet text.
    """
    # Remove URLs.
    text = re.sub(r'http\S+', '', text)

    return re.findall('[a-z0-9#@]+', text.lower())


def count_tokens(tweet):
    """Generate (token, minute) keys.
    """
    for token in tokenize_tweet(tweet.text):
        if token in whitelist:
            yield ((tweet.key, token), 1)


@click.command()
@click.option('--src', default='data/states.parquet')
@click.option('--dest', default='data/state-word-counts.json')
def main(src, dest):
Example #33
def parse_swedish():

    czech = read_languages_file('czech.txt')
    english = read_languages_file('english.txt')
    french = read_languages_file('french.txt')
    italian = read_languages_file('italian.txt')
    spanish = read_languages_file('spanish.txt')

    requested_word_count = 5000
    swedish_unfiltered = top_n_list('sv', requested_word_count)
    min_word_length = 3
    max_word_length = 8

    good_length = lambda w: min_word_length <= len(w) <= max_word_length

    used_by = lambda words_set, word: word in words_set
    # used_by_other_languages = lambda w: used_by(czech, w) or used_by(english, w) or used_by(french, w) or used_by(italian, w) or used_by(spanish, w)

    swedish = []

    # list(filter(lambda w:
    # 	good_length(w) and not used_by_other_languages(w), swedish))

    for word in swedish_unfiltered:

        if not word.isalpha():
            print("🔠 skipped word: '{}', not alpha".format(word))
            continue

        if not good_length(word):
            continue

        if used_by(czech, word):
            print("🇨🇿 skipped word: '{}', used by Czech".format(word))
            continue

        if used_by(english, word):
            print("🇬🇧 skipped word: '{}', used by English".format(word))
            continue

        if used_by(french, word):
            print("🇫🇷 skipped word: '{}', used by French".format(word))
            continue

        if used_by(italian, word):
            print("🇮🇹 skipped word: '{}', used by Italian".format(word))
            continue

        if used_by(spanish, word):
            print("🇪🇸 skipped word: '{}', used by Spanish".format(word))
            continue

        swedish.append(word)

    with open("output_swedish.txt", "w") as text_file:
        print(f"{swedish}", file=text_file)

    print(
        "🇸🇪  Outputted #{} words, after having filtered out #{}".format(
            len(swedish), (requested_word_count - len(swedish))))
Example #34
from docopt import docopt
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
from lxml import html
from lxml.html.clean import clean_html
from sklearn import svm
from sklearn import preprocessing
import snowballstemmer
from wordfreq import top_n_list

daiquiri.setup(level=logging.INFO, outputs=("stderr", ))
log = logging.getLogger(__name__)

stem = snowballstemmer.stemmer("english").stemWord
GARBAGE_TO_SPACE = dict.fromkeys((ord(x) for x in punctuation), " ")
STOP_WORDS = set(top_n_list("en", 800))

WORD_MIN_LENGTH = 2
WORD_MAX_LENGTH = 64  # sha2 length


def sane(word):
    return WORD_MIN_LENGTH <= len(word) <= WORD_MAX_LENGTH


def string2words(string):
    """Converts a string to a list of words.

    Removes punctuation, lowercase, words strictly smaller than 2 and strictly bigger than 64
    characters
Example #35
#Written by Bernardo Rodrigues ([email protected])
#Based on Luminoso Insight's wordfreq module (https://github.com/LuminosoInsight/wordfreq/)

from wordfreq import word_frequency
from wordfreq import zipf_frequency
from wordfreq import top_n_list
#import matplotlib.pyplot as plt

dest = "./wordlists/"

ar = top_n_list('ar', 1e5, wordlist='large')
de = top_n_list('de', 1e5, wordlist='large')
en = top_n_list('en', 1e5, wordlist='large')
es = top_n_list('es', 1e5, wordlist='large')
fi = top_n_list('fi', 1e5)
fr = top_n_list('fr', 1e5, wordlist='large')
hi = top_n_list('hi', 1e5)
it = top_n_list('it', 1e5, wordlist='large')
ja = top_n_list('ja', 1e5)
nl = top_n_list('nl', 1e5, wordlist='large')
sv = top_n_list('sv', 1e5)
pt = top_n_list('pt', 1e5, wordlist='large')
zh = top_n_list('zh', 1e5)

#---------------------------------------------------------------
arPopular = open(dest + '/arPopular.txt', 'w')
arLongTail = open(dest + '/arLongTail.txt', 'w')

integral100 = 0
for i in range(len(ar)):
    integral100 += word_frequency(ar[i], 'ar')