# Assumes matplotlib.pyplot is imported as pp and the project's text-utility
# module (tokenize, frequencies, logify, plotting helpers) as u.
def main():
    filenames = ['Gratian0.txt', 'Gratian1.txt', 'Gratian2.txt']
    tokens = []
    for filename in filenames:
        tokens += u.tokenize('./corpus/' + filename)
    frequencies = u.frequencies(tokens)
    stop = 31  # Figure_Zy
    # stop = len(frequencies)  # Figure_Zz
    words = []
    occurrences = []
    for b in range(1, stop):  # number of occurrences (b)
        a = 0  # number of words occurring exactly b times (a)
        for value in list(frequencies.values()):
            if value == b:
                a += 1
        k = a * b * b  # ab^2 = k formulation of Zipf's law
        if a == 0:
            continue  # log(0) throws ValueError: math domain error
        words.append(a)
        occurrences.append(b)
    u.plot_data_scatter(u.logify(zip(words, occurrences)))
    slope = u.plot_regression(u.logify(zip(words, occurrences)))
    pp.xlabel('Number of Words')
    pp.ylabel('Number of Occurrences')
    pp.title('$ab^2 = k$\n(log-log, slope = ' + f'{slope:.4f})')
    pp.savefig('./PNGs/Figure_Zy')
    # pp.savefig('./PNGs/Figure_Zz')
    pp.show()

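# The ab^2 = k formulation above says the number of distinct words a occurring
# exactly b times satisfies a * b**2 ~ constant, so log(a) and log(b) are
# linearly related. A minimal sketch with a made-up k showing that, with words
# on the x-axis and occurrences on the y-axis as in main(), the expected
# log-log slope is -1/2:

import math

k = 1000.0
points = []
for b in range(1, 6):          # b = number of occurrences
    a = k / (b * b)            # a = number of words occurring exactly b times
    points.append((math.log(a), math.log(b)))

for (x0, y0), (x1, y1) in zip(points, points[1:]):
    print(round((y1 - y0) / (x1 - x0), 3))   # -0.5 between every pair of points
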
def extract(self, data):
    tokens = []
    for attr in self.list_of_attributes:
        tokens.extend(tokenize(data[attr], self.ngram_size))
    return {key: 1 for key in tokens}

def __init__(self, train_sample, test_sample, ngram_size, progressbar=None,
             normalization=False):
    self.idf = Counter()
    self.ngram_size = ngram_size
    # document frequency: count each token at most once per training record
    for _, data, _ in train_sample:
        tmp_set = set()
        for token in tokenize(data['title'] + ' ' + data['description'],
                              self.ngram_size):
            tmp_set.add(token)
        for token in tmp_set:
            self.idf[token] += 1
        if progressbar is not None:
            progressbar.update(progressbar.currval + 1)
    # convert document frequencies to IDF: log2(N / df)
    for key in self.idf:
        self.idf[key] = math.log(float(len(train_sample)) / self.idf[key], 2)
    """
    logging.info('BagOfWordsUnitedTFIDF init done')
    logging.info('Total number of attributes ' + str(len(self.idf)))
    logging.info(self.idf)
    """
    self.attributes = {x: 'numeric' for x in self.idf}
    self.normalization = normalization
    if normalization:
        self.max_value = 0.0
        logging.info('Computing max value')
        for _, data, _ in train_sample + test_sample:
            result = self.extract(data, norm=False)
            for _, val in result.items():
                self.max_value = max(self.max_value, val)
        logging.info('max value: ' + str(self.max_value))

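# A worked example (values made up) of the IDF weighting computed above:
# with N = 8 training records, a token seen in 2 of them gets log2(8 / 2) = 2.0,
# and a token seen in every record gets log2(1) = 0, so ubiquitous tokens
# contribute nothing.

import math

n_records = 8
for df in (1, 2, 4, 8):                                  # document frequency
    print(df, math.log(float(n_records) / df, 2))        # 3.0, 2.0, 1.0, 0.0
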
def __init__(self, train_sample, at_most=1000):
    cnt = Counter()
    for _, data, _ in train_sample:
        tokens = tokenize(data['title'] + ' ' + data['description'])
        for x in tokens:
            cnt[x] += 1
    # most_common() yields (token, count) pairs; keep just the tokens as
    # attribute names
    self.attributes = {x: 'numeric' for x, _ in cnt.most_common(at_most)}

def extract(self, data):
    tokens = tokenize(data['title'] + ' ' + data['description'])
    res = {}
    for x in tokens:
        res[x] = 1
    return res

def configure_dictionary(text):
    # Build a word dictionary from the tokenized text (a list of word lists)
    # and let it reorganize itself before returning.
    list_of_words = tokenize(text)
    dic = dictionary()
    for word_list in list_of_words:
        for word in word_list:
            dic.put(word)
    dic.rearrange()
    return dic, list_of_words

def __init__(self, train_sample, ngram_size, list_of_attributes,
             min_frequency, progressbar):
    self.count = Counter()
    self.ngram_size = ngram_size
    self.list_of_attributes = list_of_attributes
    for _, data, _ in train_sample:
        for attr in self.list_of_attributes:
            for token in tokenize(data[attr], self.ngram_size):
                self.count[token] += 1
        progressbar.update(progressbar.currval + 1)
    # keep only tokens seen at least min_frequency times as features
    self.attributes = {key: 'numeric'
                       for key, value in self.count.items()
                       if value >= min_frequency}

def correct(sent, model=model, topK=5, threshold=lprob):
    # print(psutil.virtual_memory())
    sent = delNonAlphabetAndEmpty(tokenize(sent))
    sentences = mergeOne(sent)
    sentences = [replaceEntities(s) for s in sentences]
    combi = [list() for i in range(len(sentences))]
    # print(sentences, sent)
    # collect up to topK spelling candidates per token
    for i in range(len(sentences)):
        for j in range(len(sentences[i])):
            vocab = spellChecker.spell(sentences[i][j])
            if len(vocab) >= topK:
                combi[i].append(vocab[:topK])
            else:
                combi[i].append(vocab)
    # print(combi)
    # expand the candidate lists into full candidate sentences and score each
    # one with the language model
    cP = [cartesianProduct(c) for c in combi]
    del combi
    prob = list()
    for i in range(len(cP)):
        for j in range(len(cP[i])):
            prob.append((calculate_sentence_ln_prob(list(cP[i][j]), model),
                         cP[i][j]))
    del cP
    prob = [p for p in prob if p[0] > threshold]
    # print(prob)
    if prob != []:
        prob.sort(key=takeFirst, reverse=True)
        # maxp = prob[:topK]
        ans = prob[0][1]
    else:
        ans = sent
        prob.append((0, False))
    # keep the original tokens wherever an entity placeholder was substituted
    out = []
    for i in range(len(ans)):
        if ans[i] in ['B-TIME', 'B-DATE', 'NUM', 'PUNCT']:
            out.append(sent[i])
        else:
            out.append(ans[i])
    return [(prob[0][0] or float("-inf")), " ".join(out)]


# x = input()
# start = time.time()
# print(correct(x))
# stop = time.time()
# print(stop-start)

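# The candidate sentences scored above come from the cartesian product of the
# per-token spelling suggestions; cartesianProduct is the project's own helper,
# but its effect is presumably equivalent to itertools.product, as in this
# sketch. Capping suggestions at topK keeps the product from exploding, since
# the number of combinations is the product of the candidate-list sizes.

from itertools import product

candidates = [['the'], ['quick', 'quack'], ['fox', 'fix']]   # made-up suggestions
for combo in product(*candidates):
    print(combo)        # 1 * 2 * 2 = 4 candidate sentences
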
def get_features(texts, n):
    '''
    Assemble a large corpus made up of texts written by an arbitrary number of
    authors; let’s say that number of authors is x.
    '''
    corpus = []
    for text in texts:
        corpus += u.tokenize(path + text + '.txt')
    '''
    Find the n most frequent words in the corpus to use as features.
    '''
    features = list(u.frequencies(corpus).keys())[:n]
    return features

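# get_features() relies on u.frequencies() returning words ordered from most
# to least frequent, so that slicing the first n keys yields the n most
# frequent words. A minimal sketch of that assumed behaviour:

from collections import Counter

def frequencies(tokens):
    # word -> count, ordered by descending count
    return dict(Counter(tokens).most_common())

print(list(frequencies(['in', 'et', 'in', 'non', 'in', 'et']).keys())[:2])
# ['in', 'et']
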
def extract(self, data, norm=True):
    tokens = tokenize(data['title'] + ' ' + data['description'],
                      self.ngram_size)
    count = Counter()
    for x in tokens:
        count[x] += 1
    """
    for x in count:
        count[x] /= float(len(tokens))
        count[x] *= self.idf[x]
        if norm and self.normalization:
            count[x] /= self.max_value
    """
    return dict(count)

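# With made-up numbers, the weighting the disabled block would apply: a token
# occurring 3 times among 50 tokens, with idf = 2.0, would score
# (3 / 50) * 2.0 = 0.12 before any max-value normalization.

count = 3
total_tokens = 50
idf = 2.0
print((count / float(total_tokens)) * idf)   # 0.12
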
def get_frequencies(features, subcorpora):
    '''
    For each of these n features, calculate the share of each of the x
    authors’ subcorpora represented by this feature, as a percentage of the
    total number of words.
    '''
    frequencies = {}
    empty = dict.fromkeys(features, 0)
    for subcorpus in subcorpora:
        frequencies[subcorpus] = empty.copy()
        subcorpus_tokens = u.tokenize(path + subcorpus + '.txt')
        subcorpus_frequencies = u.frequencies(subcorpus_tokens)
        for feature in features:
            # expressed as occurrences per 1,000 tokens of the subcorpus
            frequencies[subcorpus][feature] = (subcorpus_frequencies.get(
                feature, 0) / len(subcorpus_tokens)) * 1000
    return frequencies

def main():
    filenames = ['Gratian0.txt', 'Gratian1.txt', 'Gratian2.txt']
    tokens = []
    for filename in filenames:
        tokens += u.tokenize('./corpus/' + filename)
    print(half(tokens))
    tmp = u.rank_frequencies(u.frequencies(tokens))
    actual = list(tmp.values())
    scale = actual[0][1]
    # 30 is a commonly used number in Burrows's articles
    theoretical = u.zipf_distrib(30, scale)
    figure_za(theoretical)
    figure_zb(theoretical)
    figure_zc(dict(itertools.islice(tmp.items(), 30)))
    figure_zd(actual[0:30])
    figure_ze(actual)

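# A sketch under an assumption: u.zipf_distrib(n, scale) is taken here to
# return the first n points of an ideal Zipf rank-frequency curve, where the
# word of rank r occurs scale / r times; the actual helper may differ.

def zipf_distrib(n, scale):
    return [scale / r for r in range(1, n + 1)]

print(zipf_distrib(5, 3000))   # [3000.0, 1500.0, 1000.0, 750.0, 600.0]
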
def add_test_values(test, features, frequencies, z_scores):
    '''
    Then, calculate the same z-scores for each feature in the text for which
    we want to determine authorship.
    '''
    test_tokens = u.tokenize(path + test + '.txt')
    test_frequencies = u.frequencies(test_tokens)
    frequencies[test] = dict.fromkeys(features, 0)
    z_scores[test] = dict.fromkeys(features, 0)
    for feature in features:
        frequencies[test][feature] = (test_frequencies.get(feature, 0) /
                                      len(test_tokens)) * 1000
        z_scores[test][feature] = (
            frequencies[test][feature] -
            frequencies['means'][feature]) / frequencies['stdevs'][feature]
    return (frequencies, z_scores)

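# A worked z-score with made-up numbers: if a feature occurs 42.0 times per
# 1,000 words in the test text, and its mean of means across the subcorpora is
# 38.5 with a standard deviation of 2.5, its z-score is (42.0 - 38.5) / 2.5 = 1.4.

feature_rate = 42.0
mean_of_means = 38.5
stdev = 2.5
print((feature_rate - mean_of_means) / stdev)   # 1.4
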
def main():
    '''
    Assemble a large corpus made up of texts written by an arbitrary number of
    authors; let’s say that number of authors is x.
    '''
    test = 'cases'  # only have to change this one line
    authors = ['cases', 'laws', 'marriage', 'other', 'penance', 'second']
    authors.remove(test)
    corpus = []
    for author in authors:
        corpus += u.tokenize('./corpus/' + author + '.txt')
    '''
    Find the n most frequent words in the corpus to use as features.
    '''
    mfws = list(u.frequencies(corpus).keys())[:30]
    '''
    For each of these n features, calculate the share of each of the x
    authors’ subcorpora represented by this feature, as a percentage of the
    total number of words.
    '''
    corp_f_dict = {}
    empty = dict.fromkeys(mfws, 0)
    for author in authors:
        corp_f_dict[author] = empty.copy()
        subcorpus = u.tokenize('./corpus/' + author + '.txt')
        subcorpus_frequencies = u.frequencies(subcorpus)
        for word in mfws:
            corp_f_dict[author][word] = (subcorpus_frequencies.get(word, 0) /
                                         len(subcorpus)) * 1000
    u.write_csv(corp_f_dict, './subcorpus_frequencies.csv')
    '''
    Then, calculate the mean and the standard deviation of these x values and
    use them as the official mean and standard deviation for this feature over
    the whole corpus. In other words, we will be using a mean of means instead
    of calculating a single value representing the share of the entire corpus
    represented by each word.
    '''
    means = empty.copy()
    stdevs = empty.copy()
    for word in mfws:
        corp_f_list = []
        for author in authors:
            corp_f_list.append(corp_f_dict[author][word])
        means[word] = statistics.mean(corp_f_list)
        stdevs[word] = statistics.stdev(corp_f_list)
    '''
    For each of the n features and x subcorpora, calculate a z-score describing
    how far away from the corpus norm the usage of this particular feature in
    this particular subcorpus happens to be. To do this, subtract the "mean of
    means" for the feature from the feature’s frequency in the subcorpus and
    divide the result by the feature’s standard deviation.
    '''
    corp_z_dict = {}
    for author in authors:
        corp_z_dict[author] = empty.copy()
        for word in mfws:
            corp_z_dict[author][word] = (corp_f_dict[author][word] -
                                         means[word]) / stdevs[word]
    '''
    Then, calculate the same z-scores for each feature in the text for which we
    want to determine authorship.
    '''
    test_tokens = u.tokenize('./corpus/' + test + '.txt')
    test_frequencies = u.frequencies(test_tokens)
    # keep frequencies and z-scores in separate dicts
    test_f_dict = empty.copy()
    test_z_dict = empty.copy()
    for word in mfws:
        test_f_dict[word] = (test_frequencies.get(word, 0) /
                             len(test_tokens)) * 1000
        # can collapse this into one loop
        test_z_dict[word] = (test_f_dict[word] - means[word]) / stdevs[word]
    print(test_z_dict)
    '''
    Finally, calculate a delta score comparing the anonymous paper with each
    candidate’s subcorpus. To do this, take the average of the absolute values
    of the differences between the z-scores for each feature between the
    anonymous paper and the candidate’s subcorpus. (Read that twice!) This
    gives equal weight to each feature, no matter how often the words occur in
    the texts; otherwise, the top 3 or 4 features would overwhelm everything
    else.
    '''
    for author in authors:
        total = 0
        for word in mfws:
            total += math.fabs(corp_z_dict[author][word] - test_z_dict[word])
        delta = total / len(mfws)
        print(test + "-" + author + " delta: " + str(delta))

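# A toy Delta computation with made-up z-scores for three features; the script
# above does the same over the 30 most frequent words. Delta is the mean
# absolute difference between the test text's z-scores and a candidate
# subcorpus's z-scores.

import math

test_z = {'et': 1.4, 'in': -0.2, 'non': 0.5}
candidate_z = {'et': 1.1, 'in': 0.6, 'non': 0.4}
delta = sum(math.fabs(test_z[w] - candidate_z[w]) for w in test_z) / len(test_z)
print(delta)   # (0.3 + 0.8 + 0.1) / 3, approximately 0.4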