import concurrent.futures
import os
import time
from concurrent.futures import ProcessPoolExecutor

from gensim.corpora import Dictionary
from gensim.models.phrases import Phrases, Phraser
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

# preprocess_tokenize is defined elsewhere in this module; it is expected to
# take a slice of docs plus a stoplist and return one token list per document.


def preprocess_text(docs):
    num_task = os.cpu_count()
    len_slices = len(docs) // num_task
    remainder_slices = len(docs) % num_task

    texts = []
    stoplist = set(stopwords.words('english'))
    
    # Force the lazy WordNet corpus to load in the parent process before the
    # worker processes are spawned.
    wn.ensure_loaded()
    t_start = time.perf_counter()
    with ProcessPoolExecutor(max_workers=num_task) as executor:

        futures_tokenize = []
        for n in range(0, num_task):

            upper_bound = (n+1) * len_slices
            if n == num_task - 1:
                upper_bound = (n+1) * len_slices + remainder_slices

            print(n, upper_bound)
            futures_tokenize.append(
                executor.submit(preprocess_tokenize,
                                docs[n * len_slices:upper_bound],
                                stoplist))

        for future in concurrent.futures.as_completed(futures_tokenize):
            texts += future.result()

    t_stop = time.perf_counter()
    print("removed stopwords and lemmatized in {} s".format(t_stop - t_start))
    # Add bigrams to the docs (only ones that appear 20 times or more).
    bigram = Phraser(Phrases(texts, min_count=20))
    for idx in range(len(texts)):
        for token in bigram[texts[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                texts[idx].append(token)

    print("Done bigrams")
    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=30, no_above=0.5)
    # "like" may already have been removed by filter_extremes, so guard the lookup.
    if "like" in dictionary.token2id:
        dictionary.filter_tokens(bad_ids=[dictionary.token2id["like"]])
    special_tokens = {'_pad_': 0}
    dictionary.patch_with_special_tokens(special_tokens)

    return texts, dictionary
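
A minimal way to drive this function might look like the sketch below. It is only an illustration: raw_docs is placeholder data, preprocess_tokenize is assumed to be the module-level tokenizer referenced above, and the __main__ guard matters because ProcessPoolExecutor re-imports the module when it spawns workers.

# Hypothetical usage sketch for preprocess_text; raw_docs is placeholder data.
if __name__ == '__main__':
    raw_docs = [
        "Patent text about battery cells and charging circuits.",
        "Another document describing a machine learning pipeline.",
    ]
    texts, dictionary = preprocess_text(raw_docs)
    # Turn the token lists into bag-of-words vectors for downstream models.
    bows = [dictionary.doc2bow(text) for text in texts]
    print(len(dictionary), "dictionary entries,", len(bows), "documents")
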
Example #2
import pandas as pd
from gensim.corpora import Dictionary

# th (the tokenization helper) and shrink_vocabulary are assumed to be
# provided elsewhere in the surrounding project.


def further_preprocessing_phase(temp_data_frame):
    temp_data_frame['text'] = temp_data_frame['text'].apply(
        lambda text: th.tokenize_text(text) if text is not None else [])
    # textlist = temp_data_frame['text'].to_numpy()
    textlist = temp_data_frame['text'].tolist()

    # Dictionary expects a list of tokens per document; raw strings or missing
    # texts would raise an exception here, hence the [] fallback above.
    patent_dictionary = Dictionary(textlist)
    corpus = [patent_dictionary.doc2bow(text) for text in textlist]

    print('original dictionary size: ', len(patent_dictionary))

    # Corpus-wide term frequency, keyed by token id.
    vocab_tf = {}
    for bow in corpus:
        for token_id, count in bow:
            vocab_tf[token_id] = vocab_tf.get(token_id, 0) + int(count)

    # Drop tokens whose corpus-wide frequency is 5 or lower.
    remove_ids = []
    for token_id, count in vocab_tf.items():
        if count <= 5:
            remove_ids.append(token_id)
    patent_dictionary.filter_tokens(bad_ids=remove_ids)

    # Note: filter_extremes keeps its defaults for no_above (0.5) and keep_n (100000).
    patent_dictionary.filter_extremes(no_below=0)
    patent_dictionary.filter_n_most_frequent(30)

    print('parsed dictionary size: ', len(patent_dictionary))

    vocabulary = list(patent_dictionary.token2id.keys())

    ids_list = []
    data_frame = pd.DataFrame(columns=['patent_id', 'text', 'classification'])
    # shrink_vocabulary is expected to fill data_frame and ids_list as side effects.
    temp_data_frame.apply(
        lambda row: shrink_vocabulary(row, vocabulary, data_frame, ids_list),
        axis=1)
    print('rows to drop: ', len(ids_list))
    data_frame.set_index(data_frame['patent_id'], inplace=True)
    data_frame.drop(ids_list, axis=0, inplace=True)
    return data_frame
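
For context, a hedged usage sketch follows. The column names mirror what the function expects, while the values are placeholders; th.tokenize_text and shrink_vocabulary are assumed to exist in the surrounding project.

# Hypothetical driver for further_preprocessing_phase; the rows are made up.
raw_frame = pd.DataFrame({
    'patent_id': ['US001', 'US002'],
    'text': ['A battery cell with an improved cathode.', None],
    'classification': ['H01M', 'H01M'],
})
clean_frame = further_preprocessing_phase(raw_frame)
print(clean_frame.shape)
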
Example #3
import logging
import os

import numpy as np
import pandas as pd
from gensim.corpora import Dictionary

# Loader and Corpus are project-local classes assumed to be importable from
# the surrounding package.


class Corpora(Loader):
    """

    """
    is_built = False

    def __init__(self,
                 data_path: str,
                 prefix: str = None,
                 iterator: str = 'token',
                 parsing: str = 'simple',
                 word_up_limit: float = 0.75,
                 word_low_limit: int = 20,
                 dictionary: str = None,
                 shuffle: bool = False,
                 seed: int = 42,
                 document_minimum_length: int = 5,
                 stopwords: str = None):

        iter_map = dict(token=self.tokenize,
                        bow=self.bowize,
                        sentences=self.sentences)
        self.iterator = iter_map[iterator]

        self.word_low_limit = word_low_limit
        self.word_up_limit = word_up_limit

        if stopwords:
            with open(stopwords) as handle:
                self.stopwords = [w.strip() for w in handle]
        else:
            self.stopwords = []

        if not dictionary:
            self.dictionary = Dictionary()
        else:
            self.dictionary = Dictionary.load_from_text(dictionary)
            if self.stopwords:
                self.dictionary.filter_tokens(
                    bad_ids=self.dictionary.doc2idx(self.stopwords))
            self.is_built = True

        self.shuffle = shuffle
        if self.shuffle:
            np.random.seed(seed)

        self.document_minimum_length = document_minimum_length

        corpus = self.init_corpus(data_path, prefix, parsing)

        super(Corpora, self).__init__(corpus=corpus)

    def __enter__(self):
        if not self.is_built:
            self.build()

        return super(Corpora, self).__enter__()

    def __exit__(self, *args):
        self.clear()
        return super(Corpora, self).__exit__(*args)

    def __iter__(self):
        for v in self.iterator():
            yield v

    def __getitem__(self, key):
        return self.iterator(index=key)

    def init_corpus(self, path: str, prefix: str, parsing: str):
        """

        """
        directory = [os.path.join(path, f) for f in os.listdir(path)]
        folders = list(filter(lambda p: os.path.isdir(p), directory))
        if prefix:
            folders = list(filter(lambda p: prefix in p, folders))

        corpus = [Corpus(path=p, parsing=parsing).load() for p in folders]
        self.__paths = {c.path: c for c in corpus}

        return corpus

    def load_vectors(self, path: str):
        """

        """
        if not path.endswith('.csv'):
            raise AssertionError(
                'Expected the word vectors to be provided as a .csv file.')
        #TODO Use dask in case of too large word vector maps.
        return pd.read_csv(path)

    def build(self):
        """

        """
        if self.is_built:
            logging.warning('Attempted to build an already built Corpora.')
            return

        for c in self.corpus:
            self.dictionary.add_documents(c.tokens)
            c.clear()

        self.dictionary.filter_extremes(no_below=self.word_low_limit,
                                        no_above=self.word_up_limit)

        return self

    def clear(self):
        """

        """
        self.dictionary = Dictionary()

    def bowize(self, index=None):
        """

        """
        N = len(self)

        iterable = self._iterator(index)

        for idx in self._indices(iterable):
            corpus = iterable[idx]
            tokens = corpus.tokens

            for ind in self._indices(tokens):
                doc_tokens = tokens[ind]
                bow = self.dictionary.doc2bow(doc_tokens)
                if len(bow) > self.document_minimum_length:
                    yield bow, N
                else:
                    logging.warning(
                        f'Document at {corpus.documents[ind]} is below the minimum length, skipping.'
                    )
                    corpus.mark_empty(ind)
            corpus.clear()

    def tokenize(self, index=None):
        """

        """
        N = len(self)

        iterable = self._iterator(index)

        for idx in self._indices(iterable):
            corpus = iterable[idx]
            tokens = corpus.tokens
            self._move()
            for ind in self._indices(tokens):
                doc_tokens = tokens[ind]
                if len(doc_tokens) > self.document_minimum_length:
                    yield doc_tokens, N
                else:
                    logging.warning(
                        f'Document at {corpus.documents[ind]} is below the minimum length, skipping.'
                    )
                    corpus.mark_empty(ind)
            corpus.clear()

    def sentences(self, index=None):
        """

        """
        iterable = self._iterator(index=index)
        for ind in self._indices(iterable=iterable):
            corpus = iterable[ind]
            for sentence in corpus.sentences:
                if len(sentence) > self.document_minimum_length:
                    yield sentence
                else:
                    logging.warning(
                        f'Skipping a sentence below the minimum length in {corpus.documents[ind]}.'
                    )

    def documents(self, index=None):
        """

        """
        for c in self.corpus:
            if len(c) > 1:
                yield c.documents
            else:
                for doc in c.documents:
                    yield doc

    @property
    def years(self):
        """

        """
        return sorted([int(c.year) for c in self.corpus])

    def _iterator(self, index=None):
        """

        """
        iterator = self.corpus
        if index is not None:
            if isinstance(index, int):
                iterator = [self.corpus[index]]  #TODO: Handle indices as slice
            elif isinstance(index, str):
                iterator = [self.__paths[index]]
        return iterator

    def _indices(self, iterable):
        """

        """
        if self.shuffle:
            indices = np.random.permutation(len(iterable))
        else:
            indices = range(len(iterable))
        return indices
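
Finally, a sketch of how the class might be consumed. The data path, prefix, and keyword values are placeholders, and Loader and Corpus are assumed to come from the surrounding project; treat this as an illustration of the iterator protocol rather than a definitive recipe.

# Hypothetical usage; paths and prefix are placeholders.
corpora = Corpora(data_path='data/patents',
                  prefix='corpus_',
                  iterator='bow',
                  word_low_limit=20,
                  word_up_limit=0.75,
                  shuffle=False)

with corpora:  # __enter__ builds the shared dictionary on first use
    for bow, total in corpora:
        pass  # feed each bag-of-words vector into a downstream model here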