def __init__(self):
    self.collection = [['a', 'word', 'a', 'word', 'the'],
                       ['the', 'a', 'brown', 'cat', 'the', 'a'],
                       ['brown', 'cat', 'the', 'a', 'word']]
    self.dictionary = {}
    self.stopWords = StopWords("D:/Information Retrieval/IR/stop words.txt")
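# Every constructor in this file depends on a StopWords helper, used in
# several forms: StopWords(path), StopWords(data_home), StopWords().list,
# and StopWords(...).get_instance(). A minimal sketch of the file-backed
# variant, assuming one stop word per line (the real class may differ):
class StopWords:
    def __init__(self, path=None):
        self.list = []
        if path is not None:
            with open(path, 'r') as f:
                self.list = [line.strip() for line in f if line.strip()]

    def __contains__(self, word):
        return word in self.list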
def __init__(self, path):
    self.Documents = []
    # Characters allowed through cleaning: letters, digits, and a small
    # punctuation set. An earlier variant allowed only [',', '-', ' '].
    self.allowed = set([chr(i) for i in range(ord('a'), ord('z') + 1)] +
                       [chr(i) for i in range(ord('A'), ord('Z') + 1)] +
                       [',', '.', '?', '-', '!', ' '] +
                       [str(i) for i in range(10)])
    self.punctuation = [';', ':', '&', '?', '/']
    self.P = Partition(self.punctuation)
    self.tagger = PatternTagger()
    self.sw = StopWords()
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                self.Documents.append(line)
def __init__(self, path):
    data_home = os.path.split(path)[0]
    self.Documents = []
    self.allowed = set([chr(i) for i in range(ord('a'), ord('z') + 1)] +
                       [chr(i) for i in range(ord('A'), ord('Z') + 1)] +
                       [',', '.', '?', '-', '!', ' '] +
                       [str(i) for i in range(10)])
    punctuation = [';', ':', '&', '?', '/']
    # P = Partition(punctuation)  -- superseded by the three-argument form below
    self.tagger = PatternTagger()
    # The input file is tab-separated; the document text is the second column.
    with open(path, 'r') as f:
        for line in f:
            li = line.split('\t')[1].strip()
            if li:
                self.Documents.append(li)
    data_Inter_path = os.path.join(data_home, "Intermediate")
    self.inter = data_Inter_path
    self.P = Partition(punctuation, data_Inter_path, data_home)
    self.sw = StopWords(data_home)
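# A hedged usage sketch of the `allowed` set built above: filtering a raw
# line down to the permitted characters. The constructors never show the
# cleaning step itself, so how `allowed` is consumed is an assumption here.
allowed = set([chr(i) for i in range(ord('a'), ord('z') + 1)] +
              [chr(i) for i in range(ord('A'), ord('Z') + 1)] +
              [',', '.', '?', '-', '!', ' '] +
              [str(i) for i in range(10)])
raw = "naïve & café?"
clean = ''.join(ch for ch in raw if ch in allowed)
print(clean)  # -> "nave  caf?"  (accented letters and '&' are dropped)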
def __init__(self, device='cpu', hyper_params=None):
    sup = super()
    sup.__init__(device=device, hyper_params=hyper_params)
    # One embedding layer per configured embedding type; their outputs are
    # concatenated, so the working dimension is the sum of the parts.
    self.embeddings = nn.ModuleList([
        sup.get_embeddings(key=key, device=device)
        for key in self.hyper_params['embeddings']
    ])
    emb_dim = sum([item.embedding_dim for item in self.embeddings])
    self.hidden_size = emb_dim
    # Two stacked bidirectional passes, built from separate forward and
    # backward unidirectional GRUs.
    self.f_gru1 = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
    self.b_gru1 = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
    self.f_gru2 = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
    self.b_gru2 = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
    self.num_head = hyper_params['num_head']
    self.attention = nn.ModuleList(
        [Attention(dimensions=emb_dim) for _ in range(self.num_head)])
    self.dropout = nn.Dropout(hyper_params['dropout_ratio'])
    self.pooling = nn.AdaptiveAvgPool1d(1)
    # emb_dim + 1: one extra scalar input feature, presumably the cheatsheet flag.
    self.output = nn.Linear(emb_dim + 1, hyper_params['num_class'])
    self.to(device)
    with Path('../data/utils/cheatsheet.txt').open('r', encoding='utf-8-sig') as f:
        self.cheatsheet = set(line.strip() for line in f)
    self.added_stop_words = StopWords(with_applied=True).get_instance()
    self.tokenizer = Tokenizer().get_instance()
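# A minimal, self-contained sketch of the pattern used above: concatenate
# several embedding outputs, then run separate forward and backward GRUs,
# flipping the input for the backward pass. Vocab sizes and dimensions are
# illustrative assumptions, and how the model combines f/b outputs is not
# shown in the constructor.
import torch
import torch.nn as nn

word_emb = nn.Embedding(100, 16)
char_emb = nn.Embedding(100, 8)
emb_dim = word_emb.embedding_dim + char_emb.embedding_dim  # 24

f_gru = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
b_gru = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)

ids = torch.randint(0, 100, (2, 5))                    # (batch, seq_len)
x = torch.cat([word_emb(ids), char_emb(ids)], dim=-1)  # (2, 5, 24)
f_out, _ = f_gru(x)                                    # forward direction
b_out, _ = b_gru(torch.flip(x, dims=[1]))              # backward on reversed input
b_out = torch.flip(b_out, dims=[1])                    # re-align to forward order
print(f_out.shape, b_out.shape)                        # torch.Size([2, 5, 24]) each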
def __init__(self,
             special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2,
                             '</s>': 3, '<mask>': 4},
             with_del_stopwords=False, lower_count=0):
    # Note: '<\s>' was corrected to the conventional '</s>' end token, and
    # the reverse map is built from word2index so that passing
    # special_tokens=None no longer crashes. A mutable default argument is
    # shared across calls; a None sentinel would be the safer idiom.
    if special_tokens is None:
        self.word2index = {'<unk>': 0, '<pad>': 1}
        self.current = 2
    else:
        self.word2index = special_tokens
        self.current = len(special_tokens)
    self.index2word = {val: key for key, val in self.word2index.items()}
    self.vocab = set(self.word2index.keys())
    self.sentence2indexes, self.indexes2sentence = SentenceIndexer().get_instance()
    self.padding_index = self.word2index['<pad>']
    self.unknown_index = self.word2index['<unk>']
    self.delim = ' '
    self.counts = {}
    self.lower_count = lower_count
    self.max_length = 0
    self.stop_words = StopWords().get_instance()
    self.text_processor = Tokenizer().get_instance()
    self.with_del_stopwords = with_del_stopwords
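# Hedged standalone sketch of the token bookkeeping above: the reverse map
# and a lookup that falls back to the unknown index. The helper name is
# illustrative, not the class's real API.
special_tokens = {'<s>': 0, '<unk>': 1, '<pad>': 2, '</s>': 3, '<mask>': 4}
word2index = dict(special_tokens)
index2word = {val: key for key, val in word2index.items()}

def lookup(word):
    # Unseen words map to the '<unk>' index, mirroring unknown_index above.
    return word2index.get(word, word2index['<unk>'])

print(index2word[3])       # '</s>'
print(lookup('aardvark'))  # 1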
def __init__(self, punctuation, data_inter_path, data_path):
    self.punctuation = set(punctuation)
    self.num_words = 0
    self.sw = StopWords(data_path)
    # chdir so the segment file lands in the intermediate data directory;
    # note this changes the working directory for the whole process.
    os.chdir(data_inter_path)
    self.f = open('phrase_segments.txt', 'w')
import csv
import random
import re


class DataManager:
    data = []
    trainingData = []
    testData = []
    stopWords = StopWords().list

    # Training data
    titles = []
    texts = []
    sentiments = []
    words = []
    countingWords = {}
    badWords = {}
    neutralWords = {}
    goodWords = {}

    # Test data
    phrases = []
    textsTest = []
    sentimentsTest = []

    def __init__(self):
        self.data = self.getData()
        self.separateData()
        self.organizeTrainingData()
        self.separateTrainingWords()
        self.separateTestPhrases()

    def getData(self):
        read = []
        with open('chennai.csv', 'r') as csvfile:
            readCSV = csv.reader(csvfile, delimiter=';')
            for row in readCSV:
                read.append(row)
        # random.shuffle(read)
        return read

    def separateData(self):
        sizeTraining = int(0.8 * len(self.data))
        size = len(self.data)
        self.data.pop(0)  # drop the CSV header row
        random.shuffle(self.data)
        # Note: slicing from 1 skips one more row beyond the popped header.
        self.trainingData = self.data[1:sizeTraining]
        self.testData = self.data[sizeTraining:size]

    def organizeTrainingData(self):
        for row in self.trainingData:
            self.titles.append(row[1])
            self.texts.append(row[2])
            self.sentiments.append(row[3])

    def addToArrayUnique(self, word, local):
        if word.lower() not in local and word.lower() not in DataManager.stopWords:
            finalWord = self.removeCharacters(word.lower())
            if finalWord != "":
                local.append(finalWord)

    def addToDictionary(self, words, local):
        for word in words:
            if word in local:  # dict.has_key() was removed in Python 3
                local[word] = local[word] + 1
            else:
                local[word] = 1

    def separateTrainingWords(self):
        for i in range(0, len(self.texts)):
            wordsPerText = self.texts[i].split()
            for word in wordsPerText:
                self.addToArrayUnique(word, self.words)
            if self.sentiments[i] == '1':
                self.addToDictionary(self.words, self.badWords)
            elif self.sentiments[i] == '2':
                self.addToDictionary(self.words, self.neutralWords)
            elif self.sentiments[i] == '3':
                self.addToDictionary(self.words, self.goodWords)
            self.addToDictionary(self.words, self.countingWords)
            self.cleanData()

    def separateTestPhrases(self):
        a = 1
        dictionaryRow = {}
        dictionaryWords = {}
        for row in self.testData:
            if a == 2:
                dictionaryRow['titles'] = row[1]
                self.addToDictionary(self.separeteWords(row[2]), dictionaryWords)
                dictionaryRow['probabilityWords'] = dictionaryWords
                dictionaryRow['sentiments'] = row[3]
                self.phrases.append(dictionaryRow)
                dictionaryRow = {}
                dictionaryWords = {}
            else:
                a = 2  # skip the first test row

    def separeteWords(self, text):
        separatedWords = []
        wordsPerText = text.split()
        for word in wordsPerText:
            self.addToArrayUnique(word, separatedWords)
        return separatedWords

    def removeCharacters(self, word):
        # Unicode normalization would map a character to its Latin equivalent:
        # nfkd = unicodedata.normalize('NFKD', word)
        # word = u''.join([c for c in nfkd if not unicodedata.combining(c)])
        # Use a regular expression to keep only letters, digits, and spaces.
        return re.sub(r'[^a-zA-Z0-9 ]', '', word)

    def cleanData(self):
        self.words = []
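# A hedged standalone walk-through of addToDictionary's counting logic and
# removeCharacters' cleaning, since DataManager itself needs chennai.csv
# (a ';'-delimited file with title/text/sentiment columns) to run.
import re

def remove_characters(word):
    return re.sub(r'[^a-zA-Z0-9 ]', '', word)

counts = {}
for word in ['flood', 'rain!', 'flood']:
    w = remove_characters(word.lower())
    counts[w] = counts.get(w, 0) + 1
print(counts)  # {'flood': 2, 'rain': 1}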
def __init__(self):
    self.collection = [['w1', 'w2', 'w4', 'w6'],
                       ['w1', 'w2', 'w7', 'w3'],
                       ['w8', 'w5', 'w4', 'w5', 'w6']]
    self.dictionary = {}
    self.stopWords = StopWords(
        "D:/Information Retrieval/Assignment 2/stop words.txt")
def __init__(self):
    self.dictionary = {}
    self.stopWords = StopWords("E:/New folder (2)/IR assi/stop words.txt")
def __init__(self, punctuation):
    self.punctuation = set(punctuation)
    self.num_words = 0
    self.f = open('Intermediate/phrase_segments.txt', 'w')
    self.sw = StopWords()
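# Partition's implementation isn't shown; from its constructors it keeps a
# punctuation set and writes phrase segments to phrase_segments.txt. A
# minimal standalone sketch of what splitting on that punctuation set might
# look like (illustrative assumption, not the class's real behavior):
punctuation = {';', ':', '&', '?', '/'}

def segments(text):
    seg, out = [], []
    for ch in text:
        if ch in punctuation:
            if seg:
                out.append(''.join(seg).strip())
                seg = []
        else:
            seg.append(ch)
    if seg:
        out.append(''.join(seg).strip())
    return out

print(segments('a brown cat; the word: done'))
# ['a brown cat', 'the word', 'done']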