Exemplo n.º 1
0
def run(corpus_file_object=None, keep_case=False, max_word_tokens=0):
    """Count word unigrams, bigrams and trigrams in a line-based corpus.

    Each line is passed through ``fix_punctuations``, stripped, optionally
    case-folded and split on whitespace. N-grams are built per line, so
    they never span a line boundary.

    Args:
        corpus_file_object: Iterable of text lines (e.g. an open file).
            ``None`` is treated as an empty corpus.
        keep_case: If false, every line is case-folded before it is split.
        max_word_tokens: Stop reading once at least this many word tokens
            have been counted; 0 means no limit. The limit is checked
            between lines, so the last line read is always counted in full.

    Returns:
        A tuple ``(unigrams, bigrams, trigrams)`` of plain dicts mapping
        an n-gram to its count. Unigram keys are words; bigram and trigram
        keys are tuples of words.
    """
    unigrams_counter = Counter()
    bigrams_counter = Counter()
    trigrams_counter = Counter()

    # The default argument is None; treat it as "no lines" rather than
    # failing with "'NoneType' object is not iterable".
    lines = () if corpus_file_object is None else corpus_file_object

    word_token_count = 0

    for line in lines:
        # ">=" (not ">"): reaching the limit exactly must also stop reading,
        # otherwise one extra line is counted beyond the requested maximum.
        if max_word_tokens and word_token_count >= max_word_tokens:
            break

        line = fix_punctuations(line).strip()

        if not keep_case:
            line = line.casefold()

        words = line.split()
        if not words:
            continue

        word_token_count += len(words)

        unigrams_counter.update(words)
        # zip stops at the shortest slice, yielding each window of 2 / 3
        # consecutive words as a tuple.
        bigrams_counter.update(zip(words, words[1:]))
        trigrams_counter.update(zip(words, words[1:], words[2:]))

    return dict(unigrams_counter), dict(bigrams_counter), dict(trigrams_counter)
Exemplo n.º 2
0
def run(corpus_file_object=None, keep_case=False, max_word_tokens=0):
    """Count word unigrams, bigrams and trigrams in a line-based corpus.

    Each line is passed through ``fix_punctuations``, stripped, optionally
    lowercased and split on whitespace. N-grams are built per line, so
    they never span a line boundary.

    Args:
        corpus_file_object: Iterable of text lines (e.g. an open file).
            ``None`` is treated as an empty corpus.
        keep_case: If false, every line is lowercased before it is split.
        max_word_tokens: Stop reading once at least this many word tokens
            have been counted; 0 means no limit. The limit is checked
            between lines, so the last line read is always counted in full.

    Returns:
        A tuple ``(unigrams, bigrams, trigrams)`` of plain dicts mapping
        an n-gram to its count. Unigram keys are words; bigram and trigram
        keys are tuples of words.
    """
    unigrams_counter = Counter()
    bigrams_counter = Counter()
    trigrams_counter = Counter()

    # The default argument is None; treat it as "no lines" rather than
    # failing with "'NoneType' object is not iterable".
    lines = () if corpus_file_object is None else corpus_file_object

    word_token_count = 0

    for line in lines:
        # ">=" (not ">"): reaching the limit exactly must also stop reading,
        # otherwise one extra line is counted beyond the requested maximum.
        if max_word_tokens and word_token_count >= max_word_tokens:
            break

        line = fix_punctuations(line).strip()

        if not keep_case:
            line = line.lower()

        words = line.split()
        if not words:
            continue

        word_token_count += len(words)

        unigrams_counter.update(words)
        # zip stops at the shortest slice, yielding each window of 2 / 3
        # consecutive words as a tuple.
        bigrams_counter.update(zip(words, words[1:]))
        trigrams_counter.update(zip(words, words[1:], words[2:]))

    return (dict(unigrams_counter), dict(bigrams_counter),
            dict(trigrams_counter))
Exemplo n.º 3
0
    def __init__(self,
                 file_path=None,
                 wordlist_file=False,
                 corpus_object=None,
                 wordlist_object=None,
                 encoding=ENCODING,
                 **kwargs):
        """Record the data sources and initialise all result attributes.

        Args:
            file_path: Path handed to ``self._check_file_path``; the result
                is stored as ``self.file_abspath``.
            wordlist_file: If true, the file is opened as a wordlist file
                instead of as a corpus file.
            corpus_object: Optional in-memory corpus, either a list of
                strings (joined with single spaces) or one text string.
            wordlist_object: Optional in-memory wordlist, either a dict of
                word-count pairs (dict subclasses included) or an iterable
                of words.
            encoding: Encoding used when opening the file.
            **kwargs: Forwarded to ``self._determine_parameters``.

        Raises:
            TypeError: If ``wordlist_object`` is neither a dict nor an
                iterable, or ``corpus_object`` is neither a list nor text.

        Note:
            File objects opened here are kept on the instance; this method
            does not close them.
        """
        self.file_abspath = self._check_file_path(file_path)

        if self.file_abspath is None:
            self.directory = None
        else:
            self.directory = os.path.dirname(self.file_abspath)

        self.file_is_wordlist = wordlist_file
        self.encoding = encoding
        self.corpus_object = corpus_object
        self.wordlist_object = wordlist_object
        self.parameters_ = self._determine_parameters(**kwargs)

        # number of word types and tokens
        self._number_of_word_types = None
        self._number_of_word_tokens = None

        # word ngrams
        self._word_unigram_counter = None
        self._word_bigram_counter = None
        self._word_trigram_counter = None

        # wordlist
        self._wordlist = None
        if self.wordlist_object is not None:
            # self.wordlist_object is
            # either an iterable or a dict of word-count pairs
            if isinstance(self.wordlist_object, dict):
                if self.parameters_['keep_case']:
                    # Copy so the stored counter does not alias the
                    # caller's dict.
                    word_count_dict = dict(self.wordlist_object)
                else:
                    # Merge the counts of words that differ only in case.
                    # .items() is required: iterating the dict itself
                    # yields keys only, not (word, count) pairs.
                    word_count_dict = dict()
                    for word, count in self.wordlist_object.items():
                        word = word.lower()
                        word_count_dict[word] = (
                            word_count_dict.get(word, 0) + count)

                # Ordered by count via double_sorted(reverse=True);
                # presumably ties are broken by the word itself -- confirm
                # against double_sorted.
                self._wordlist = [
                    word_
                    for word_, _ in double_sorted(word_count_dict.items(),
                                                  key=lambda x: x[1],
                                                  reverse=True)
                ]
                self._word_unigram_counter = word_count_dict

            elif hasattr(self.wordlist_object, '__iter__'):
                if self.parameters_['keep_case']:
                    self._wordlist = sorted(set(self.wordlist_object))
                else:
                    self._wordlist = sorted(
                        set(w.lower() for w in self.wordlist_object))
                # No counts are available, so every word type counts once.
                self._word_unigram_counter = {w: 1 for w in self._wordlist}

            else:
                raise TypeError('wordlist object must be a dict of word-count '
                                'pairs or an iterable of words')

        # corpus file object
        if self.corpus_object is not None:
            # self.corpus_object is either a list of strings or a long str
            if isinstance(self.corpus_object, list):
                corpus_str = fix_punctuations(' '.join(self.corpus_object))
            elif isinstance(self.corpus_object, six.text_type):
                corpus_str = fix_punctuations(self.corpus_object)
            else:
                raise TypeError('corpus object must be either a text or list')
            # Wrap the text so it can be read like a file.
            self.corpus_file_object = StringIO(corpus_str)
        elif self.file_abspath and not self.file_is_wordlist:
            self.corpus_file_object = open(self.file_abspath,
                                           encoding=self.encoding)
        else:
            self.corpus_file_object = None

        # wordlist file object
        if self.file_is_wordlist:
            self.wordlist_file_object = open(self.file_abspath,
                                             encoding=self.encoding)
        else:
            self.wordlist_file_object = StringIO()

        # manifold-related objects
        self._words_to_neighbors = None
        self._words_to_contexts = None
        self._contexts_to_words = None
        self._neighbor_graph = None

        # phon objects
        self._phone_unigram_counter = None
        self._phone_bigram_counter = None
        self._phone_trigram_counter = None

        self._phone_dict = None
        self._biphone_dict = None
        self._word_dict = None
        self._words_to_phones = None

        # trie objects
        self._broken_words_left_to_right = None
        self._broken_words_right_to_left = None
        self._successors = None
        self._predecessors = None

        Lexicon_BiSig.__init__(self, self.wordlist(),
                               self.parameters_['min_stem_length'],
                               self.parameters_['max_affix_length'],
                               self.parameters_['min_sig_count'],
                               self.parameters_['suffixing'])
Exemplo n.º 4
0
    def _initialize(self):
        """Initialise result attributes and build the wordlist/corpus inputs.

        Reads ``self.wordlist_object``, ``self.corpus_object``,
        ``self.file_abspath``, ``self.file_is_wordlist``, ``self.encoding``
        and ``self.parameters_['keep_case']``. Result attributes are set to
        ``None``, except the wordlist and unigram counter, which are built
        from ``self.wordlist_object`` when it is given.

        Raises:
            TypeError: If ``self.wordlist_object`` is neither a dict nor an
                iterable, or ``self.corpus_object`` is neither a list nor
                a str.

        Note:
            File objects opened here are kept on the instance; this method
            does not close them.
        """
        # number of word types and tokens
        self._number_of_word_types = None
        self._number_of_word_tokens = None

        # word ngrams
        self._word_unigram_counter = None
        self._word_bigram_counter = None
        self._word_trigram_counter = None

        # wordlist
        self._wordlist = None
        if self.wordlist_object is not None:
            # self.wordlist_object is
            # either an iterable or a dict of word-count pairs
            if isinstance(self.wordlist_object, dict):
                if self.parameters_['keep_case']:
                    # Copy so the stored counter does not alias the
                    # caller's dict.
                    word_count_dict = dict(self.wordlist_object)
                else:
                    # Merge the counts of words that differ only in case.
                    # .items() is required: iterating the dict itself
                    # yields keys only, not (word, count) pairs.
                    word_count_dict = dict()
                    for word, count in self.wordlist_object.items():
                        word = word.lower()
                        word_count_dict[word] = (
                            word_count_dict.get(word, 0) + count)

                # Ordered by count via double_sorted(reverse=True);
                # presumably ties are broken by the word itself -- confirm
                # against double_sorted.
                self._wordlist = [word for word, _ in
                                  double_sorted(word_count_dict.items(),
                                                key=lambda x: x[1],
                                                reverse=True)]
                self._word_unigram_counter = word_count_dict

            elif hasattr(self.wordlist_object, '__iter__'):
                if self.parameters_['keep_case']:
                    self._wordlist = sorted(set(self.wordlist_object))
                else:
                    self._wordlist = sorted(
                        set(w.lower() for w in self.wordlist_object))
                # No counts are available, so every word type counts once.
                self._word_unigram_counter = {w: 1 for w in self._wordlist}

            else:
                raise TypeError('wordlist object must be a dict of word-count '
                                'pairs or an iterable of words')

        # signature-related objects
        self._stems_to_words = None
        self._signatures_to_stems = None
        self._stems_to_signatures = None
        self._words_to_signatures = None
        self._signatures_to_words = None
        self._words_to_sigtransforms = None

        self._signatures = None
        self._affixes_to_signatures = None
        self._words_in_signatures = None
        self._affixes = None
        self._stems = None

        # corpus file object
        if self.corpus_object is not None:
            # self.corpus_object is either a list of strings or a long str
            if isinstance(self.corpus_object, list):
                corpus_str = fix_punctuations(' '.join(self.corpus_object))
            elif isinstance(self.corpus_object, str):
                corpus_str = fix_punctuations(self.corpus_object)
            else:
                raise TypeError('corpus object must be either a str or a list')
            # Wrap the text so it can be read like a file.
            self.corpus_file_object = StringIO(corpus_str)
        elif self.file_abspath and not self.file_is_wordlist:
            self.corpus_file_object = open(self.file_abspath,
                                           encoding=self.encoding)
        else:
            self.corpus_file_object = None

        # wordlist file object
        if self.file_is_wordlist:
            self.wordlist_file_object = open(self.file_abspath,
                                             encoding=self.encoding)
        else:
            self.wordlist_file_object = StringIO()

        # manifold-related objects
        self._words_to_neighbors = None
        self._words_to_contexts = None
        self._contexts_to_words = None
        self._neighbor_graph = None

        # phon objects
        self._phone_unigram_counter = None
        self._phone_bigram_counter = None
        self._phone_trigram_counter = None

        self._phone_dict = None
        self._biphone_dict = None
        self._word_dict = None
        self._words_to_phones = None

        # trie objects
        self._broken_words_left_to_right = None
        self._broken_words_right_to_left = None
        self._successors = None
        self._predecessors = None