def test_sentence(self):
    assert list(textparser.word_tokenize(
        text='Life is about making an impact, not making an income.',
    )) == [
        ('life', ), ('is', ), ('about', ), ('making', ), ('an', ),
        ('impact', ), ('not', ), ('making', ), ('an', ), ('income', ),
    ]
def test_ignores_stopwords(self):
    assert list(textparser.word_tokenize(
        text='The first rule of python is',
        stopwords=set(['the', 'of', 'is']),
        min_length=1,
    )) == [('first', ), ('rule', ), ('python', )]
def get_data_from_collection(self):
    if os.path.exists(self.env_dir + self.dir) and len(
            os.listdir(self.env_dir + self.dir)) > 0 and len(
                self.collection) > 0:
        index = hashedindex.HashedIndex()
        doc_count = 0
        with io.open(self.env_dir + self.dir + self.collection,
                     'r', encoding='utf8') as fp:
            for line in fp.readlines():
                for term in textparser.word_tokenize(
                        line, min_length=2, ignore_numeric=True):
                    time.sleep(1)  # pause one second per indexed term
                    index.add_term_occurrence(
                        term, self.collection + "/line-" + str(doc_count))
                self.docnames.append(self.collection + "/line-" +
                                     str(doc_count))
                doc_count += 1
        # Proof of concept: check that a matrix of 1s and 0s with the
        # term incidences is actually generated.
        for doc in self.docnames:
            aux_doc = []
            for term in index.terms():
                if round(index.get_term_frequency(term, doc)) > 0:
                    aux_doc.append(1)
                else:
                    aux_doc.append(0)
            self.matrix.append(aux_doc)
        self.matrix = np.matrix(self.matrix)
        # Build the array of terms (strip the tuple repr down to the word).
        for term in index.terms():
            self.terms.append(re.sub(r"(\(\'|\'\,\))", "", str(term)))
    else:
        print("Attempting to create '{}' into {}.".format(
            self.dir, self.env_dir))
        if not os.path.exists(self.env_dir + self.dir):
            os.mkdir(self.env_dir + self.dir, mode=0o777)
            print("The input folder, '{}', was created successfully in {}."
                  .format(self.dir, self.env_dir))
        else:
            print("The input folder, '{}', is empty in {}.".format(
                self.dir, self.env_dir))
    return self.matrix, self.docnames, self.terms
def get_data_from_input(self):
    if os.path.exists(self.env_dir + self.dir) and len(
            os.listdir(self.env_dir + self.dir)) > 0:
        self.docnames = [
            f for f in listdir(self.env_dir + self.dir)
            if isfile(join(self.env_dir + self.dir, f))
        ]
        index = hashedindex.HashedIndex()
        for doc in self.docnames:
            with io.open(self.env_dir + self.dir + doc,
                         'r', encoding='utf8') as fp:
                text = re.sub('(\t\n|\t|\n|_)', " ", fp.read())
                for term in textparser.word_tokenize(
                        text, min_length=2, ignore_numeric=True):
                    index.add_term_occurrence(term, doc)
        # Proof of concept: check that a matrix of 1s and 0s with the
        # term incidences is actually generated.
        for doc in self.docnames:
            aux_doc = []
            for term in index.terms():
                if round(index.get_term_frequency(term, doc)) > 0:
                    aux_doc.append(1)
                else:
                    aux_doc.append(0)
            self.matrix.append(aux_doc)
        self.matrix = np.matrix(self.matrix)
        # Build the array of terms (strip the tuple repr down to the word).
        for term in index.terms():
            self.terms.append(re.sub(r"(\(\'|\'\,\))", "", str(term)))
    else:
        print("Attempting to create '{}' into {}.".format(
            self.dir, self.env_dir))
        if not os.path.exists(self.env_dir + self.dir):
            os.mkdir(self.env_dir + self.dir, mode=0o777)
            print("The input folder, '{}', was created successfully in {}."
                  .format(self.dir, self.env_dir))
        else:
            print("The input folder, '{}', is empty in {}.".format(
                self.dir, self.env_dir))
    return self.matrix, self.docnames, self.terms
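# The two loaders above strip word_tokenize's tuple repr with a regex to get the
# plain term string. Since word_tokenize yields each term as a tuple (a 1-tuple
# for unigrams), indexing into the tuple is a simpler alternative. A minimal,
# self-contained sketch of that idea; the import paths are assumed from the
# usage in the snippets above:
import hashedindex
from hashedindex import textparser

index = hashedindex.HashedIndex()
for term in textparser.word_tokenize('the quick brown fox', min_length=2):
    index.add_term_occurrence(term, 'doc-0')

# term is a 1-tuple such as ('quick',), so term[0] is the plain word.
plain_terms = [term[0] for term in index.terms()]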
def preprocess_tweet(self, text):
    text = text.lower()
    # Always drop the literal "url" token; optionally drop hashtags and mentions.
    stripped = re.sub(r'\burl\b', '', text)
    if self.config.remove_hash_tags_and_mentions:
        stripped = re.sub(r'(\b|\s)([@#][\w_-]+)', '', stripped)
    tokens = [
        term[0] for term in textparser.word_tokenize(
            stripped,
            stopwords.words('english') if self.config.remove_stopwords else [])
    ]
    return tokens
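# Hypothetical usage of preprocess_tweet (the object name below is illustrative;
# only the config flags referenced in the method are assumed):
#
#     from nltk.corpus import stopwords  # assumed source of stopwords.words('english')
#
#     tokens = cleaner.preprocess_tweet('Loving this library URL #nlp @maintainer')
#     # the literal token "url" is always removed; the hashtag and mention are
#     # removed only when config.remove_hash_tags_and_mentions is enabled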
def test_ngrams(self):
    assert list(textparser.word_tokenize(
        text='foo bar bomb blar',
        ngrams=2,
    )) == [('foo', 'bar'), ('bar', 'bomb'), ('bomb', 'blar')]
def test_ignores_numeric(self):
    assert list(textparser.word_tokenize(
        text='one two 3 four',
    )) == [('one', ), ('two', ), ('four', )]
def test_min_length(self):
    assert list(textparser.word_tokenize(
        text='one for the money two for the go',
        min_length=4,
    )) == [('money', )]
def test_splits_punctuation(self):
    assert list(textparser.word_tokenize(
        text='first. second',
    )) == [('first', ), ('second', )]