예제 #1
0
def generate_features(model, df):
    """Build the feature vocabulary from the n-grams stored in ``df``.

    The n-grams of every row are first pooled per target label. When
    ``model.hyper.feature_drop`` is non-zero, that fraction of each label's
    pool is randomly discarded. The surviving n-grams of all labels are then
    merged, their occurrences in ``df['ngram']`` are counted, and the result
    is reduced according to ``model.hyper.feature_pick``:

    * ``'freq'`` -- draw ``feature_size`` features with ``random.choices``;
    * ``'top'``  -- keep the ``feature_size`` most frequent features;
    * anything else -- keep every feature.

    Args:
        model: Object exposing ``hyper.feature_drop``, ``hyper.feature_pick``
            and ``hyper.feature_size``.
        df: DataFrame with a ``'target'`` column and an ``'ngram'`` column
            holding an iterable of n-grams per row.

    Returns:
        A ``(count_label, features)`` pair: the number of distinct targets
        seen and a NumPy array (dtype ``'<U45'``) of the selected features.
    """
    with Progress(*progress_layout, console=console) as progress:
        taskid = progress.add_task('Generating features...', total=len(df))

        dict_set_features: Dict[int, set] = {}

        @use_hook(hook_advance(progress, taskid))
        def row_generate_features(row):
            # Pool the row's n-grams under its target label.
            dict_set_features.setdefault(row['target'],
                                         set()).update(row['ngram'])

        df.apply(row_generate_features, axis=1)

        count_label = len(dict_set_features)
        feature_drop = model.hyper.feature_drop
        # Only resample when a drop rate is actually configured. The previous
        # guard was inverted (`if not feature_drop`), so a non-zero rate was
        # silently ignored.
        if feature_drop:
            keep_ratio = 1 - feature_drop
            dict_set_features = {
                label: set(
                    random.sample(list(pool), int(keep_ratio * len(pool))))
                for label, pool in dict_set_features.items()
            }

        features = tuple(set().union(*dict_set_features.values()))

    with Progress(*progress_layout, console=console) as progress:
        taskid = progress.add_task('Sampling features...', total=len(df))

        dict_frequency_feature = dict.fromkeys(features, 0)

        @use_hook(hook_advance(progress, taskid))
        def row_count_frequency(ngrams):
            # Count only n-grams that survived the drop step above.
            for ngram in ngrams:
                if ngram in dict_frequency_feature:
                    dict_frequency_feature[ngram] += 1

        df['ngram'].apply(row_count_frequency)

        feature_pick = model.hyper.feature_pick
        if feature_pick == 'freq':
            # NOTE(review): this draws uniformly *with replacement* and does
            # not use dict_frequency_feature; if 'freq' is meant to be
            # frequency-weighted, pass `weights=` here -- confirm intent.
            features = random.choices(features, k=model.hyper.feature_size)
        elif feature_pick == 'top':
            # sorted() is stable, so ties keep the order of `features`.
            ranked = sorted(dict_frequency_feature,
                            key=dict_frequency_feature.get,
                            reverse=True)
            features = tuple(ranked[:model.hyper.feature_size])

    return count_label, np.array(features, dtype='<U45')
 def __test_iterate(self, progress, sample):
     """Run the test-set pass for ``sample`` under a temporary progress task.

     Adds an 'Iterating...' task sized to ``sample``, calls
     ``calc.calc_prob`` on ``self.df_test`` (with the flag ``False`` and a
     hook that advances the task), then ``calc.calc_predict``, and finally
     hides the task.
     """
     task = progress.add_task(f'Iterating...', total=len(sample))
     on_advance = hook_advance(progress, task)

     calc.calc_prob(self, self.df_test, sample, False, on_advance)
     calc.calc_predict(self, self.df_test, sample)

     # The task is only hidden, not removed, once the pass is finished.
     progress.update(task, visible=False)
예제 #3
0
def segmentate(model, df, name_df):
    """Tokenize the ``'data'`` column of ``df`` in place.

    Every raw string in ``df['data']`` is replaced by a list of lower-cased
    tokens. When ``model.hyper.lemmatize`` is set, each token is additionally
    mapped through the module-level ``dict_lemmatize`` table, which is loaded
    once from ``corpora/lemmatize.csv`` (tab-separated, columns ``word`` and
    ``lemma``) on first use.

    Args:
        model: Object exposing ``hyper.lemmatize``.
        df: DataFrame whose ``'data'`` column holds raw strings.
        name_df: Label of the data set, used only in the progress message.

    Returns:
        None. ``df['data']`` is overwritten with the token lists.
    """
    global dict_lemmatize
    with Progress(*progress_layout, console=console) as progress:
        taskid = progress.add_task(f'Segmentation on {name_df} set...',
                                   total=len(df))

        if model.hyper.lemmatize and dict_lemmatize is None:
            df_lemmatize = pd.read_csv('corpora/lemmatize.csv', sep='\t')
            dict_lemmatize = dict(
                zip(df_lemmatize['word'], df_lemmatize['lemma']))

        def split_new(string):
            """Split a raw string into lower-cased tokens.

            Unnecessary punctuation is removed; based upon [this
            source](https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py).
            An empty or all-punctuation input yields an empty list.
            """
            # E-mail addresses have to go first: the character filter below
            # turns '@', '.', '-' and '_' into spaces, after which an address
            # can no longer be recognised (the old ordering made this pass a
            # no-op).
            string = re.sub(r'[\w\-_.]+@[\w\-_.]+', '', string)  # Email

            # Markdown emphasis markers ('_', '*') need no dedicated pass:
            # this filter already replaces them with spaces.
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

            # Detach English clitics so they become separate tokens.
            string = re.sub(r"\'s", " 's", string)
            string = re.sub(r"\'ve", " 've", string)
            string = re.sub(r"n\'t", " n't", string)
            string = re.sub(r"\'re", " 're", string)
            string = re.sub(r"\'d", " 'd", string)
            string = re.sub(r"\'ll", " 'll", string)

            # Isolate the remaining punctuation marks.
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            # NOTE(review): as before, these three replacements emit a
            # literal backslash (tokens come out as '\(' , '\)' , '\?'),
            # inherited from the referenced source. Raw strings keep that
            # output while avoiding the invalid-escape-sequence warning.
            string = re.sub(r"\(", r" \( ", string)
            string = re.sub(r"\)", r" \) ", string)
            string = re.sub(r"\?", r" \? ", string)

            string = re.sub(r"\s{2,}", " ", string)
            # split() without an argument returns [] for empty text, whereas
            # split(' ') would return [''] and inject an empty token.
            return string.strip().lower().split()

        def lemmatization(lst):
            """Map each word to its lemma, leaving unknown words unchanged.

            The lemmatization list is from [this
            source](https://github.com/michmech/lemmatization-lists/blob/master/lemmatization-en.txt).
            """
            return [dict_lemmatize.get(word, word) for word in lst]

        @use_hook(hook_advance(progress, taskid))
        def row_segmentate(string):
            lst = split_new(string)
            if model.hyper.lemmatize:
                lst = lemmatization(lst)
            return lst

        df['data'] = df['data'].apply(row_segmentate)
    def __batch(self, df, progress, sample, set_type):
        """Process one batch ``sample`` of ``df`` for training or testing.

        Does nothing for an empty ``sample``. Otherwise it caches the feature
        vector and target matrix through ``calc``, runs the train iteration
        when ``set_type == 'train'`` (the test iteration for any other
        value), dumps the cached feature vector and hides the progress task
        it created.
        """
        if len(sample) == 0:
            return

        task = progress.add_task(f'Generating feature vector...',
                                 total=len(sample))
        calc.cache_feature_vector(self, df, sample,
                                  hook_advance(progress, task))
        calc.cache_target_matrix(self, df, sample)

        # Select the iteration routine once, then invoke it.
        iterate = (self.__train_iterate
                   if set_type == 'train' else self.__test_iterate)
        iterate(progress, sample)

        calc.dump_feature_vector(self, df, sample)
        progress.update(task, visible=False)
    def __train_iterate(self, progress, sample):
        """Run ``self.hyper.iteration`` training passes over ``sample``.

        One overall progress task tracks the number of completed passes, and
        each pass gets its own task sized to ``sample``. Every pass calls
        ``calc.calc_prob`` on ``self.df_train`` with the flag ``True``. All
        tasks created here are hidden once the last pass has finished.
        """
        overall = progress.add_task(f'Iterating...',
                                    total=self.hyper.iteration)
        per_iteration = []

        for iteration in range(self.hyper.iteration):
            current = progress.add_task(
                f'Iteration #{iteration+1} of {self.hyper.iteration}...',
                total=len(sample))
            per_iteration.append(current)

            on_advance = hook_advance(progress, current)
            calc.calc_prob(self, self.df_train, sample, True, on_advance)

            progress.update(
                current,
                description=f'[green]Iteration #{iteration+1} done.')
            progress.advance(overall)

        # Hide the per-pass tasks first, then the overall one.
        for finished in (*per_iteration, overall):
            progress.update(finished, visible=False)
예제 #6
0
def extract_ngram(model, df, name_df):
    """Attach an ``'ngram'`` column of stringified n-grams to ``df``.

    For every token list in ``df['data']`` the set of unigrams and bigrams
    (plus trigrams when ``model.hyper.trigram`` is set) is collected. When
    ``model.hyper.stopword`` is set, entries found in the module-level
    ``set_stopwords`` are removed; that set is loaded once from
    ``corpora/stopwords.csv`` (tab-separated, column ``stopword``) on first
    use. Each remaining n-gram is converted with ``str()`` and stored in a
    NumPy array of dtype ``'<U45'``.

    Args:
        model: Object exposing ``hyper.stopword`` and ``hyper.trigram``.
        df: DataFrame whose ``'data'`` column holds lists of tokens.
        name_df: Label of the data set, used only in the progress message.

    Returns:
        None. The result is written to ``df['ngram']``.
    """
    global set_stopwords

    def generate_trigram(arr: List[str]) -> Set[Tuple[str, str, str]]:
        # zip() stops at the shortest slice, so every window of three
        # consecutive tokens is produced -- including the last one, which the
        # previous `arr[:-3], arr[1:-2], arr[2:-1]` slicing dropped.
        return set(zip(arr, arr[1:], arr[2:]))

    def generate_bigram(arr: List[str]) -> Set[Tuple[str, str]]:
        # Same off-by-one fix as above: the final pair is no longer lost.
        return set(zip(arr, arr[1:]))

    def generate_unigram(arr: List[str]) -> Set[str]:
        # Unigrams stay plain strings (not 1-tuples) so they compare equal to
        # the single-word stopword entries.
        return set(arr)

    if model.hyper.stopword and set_stopwords is None:
        # Remove stopwords in word lists. The stopwords list is a joint from
        # [this source](https://code.google.com/archive/p/stop-words/) and
        # [this source](https://web.archive.org/web/20111226085859/http://oxforddictionaries.com/words/the-oec-facts-about-the-language)
        df_stopwords = pd.read_csv('corpora/stopwords.csv', sep='\t')
        set_stopwords = set(df_stopwords['stopword'])
        # Every ordered pair of stopwords is added as well, so bigrams made
        # up solely of stopwords are filtered out too.
        set_stopwords.update(
            itertools.product(df_stopwords['stopword'],
                              df_stopwords['stopword']))

    with Progress(*progress_layout, console=console) as progress:
        taskid = progress.add_task(f'Extracting ngrams from {name_df} set...',
                                   total=len(df))

        @use_hook(hook_advance(progress, taskid))
        def row_extract_ngram(arr):
            ngrams = generate_unigram(arr) | generate_bigram(arr)
            if model.hyper.trigram:
                ngrams |= generate_trigram(arr)
            if model.hyper.stopword:
                ngrams = ngrams.difference(set_stopwords)
            # Tuples are stored via their str() form; '<U45' truncates any
            # entry longer than 45 characters.
            return np.array([str(ngram) for ngram in ngrams], dtype='<U45')

        df['ngram'] = df['data'].apply(row_extract_ngram)