def __init__(self):
    self.collection = [['a', 'word', 'a', 'word', 'the'],
                       ['the', 'a', 'brown', 'cat', 'the', 'a'],
                       ['brown', 'cat', 'the', 'a', 'word']]
    self.dictionary = {}
    self.stopWords = StopWords("D:/Information Retrieval/IR/stop words.txt")
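# Every constructor in this file depends on a StopWords helper, used in
# several forms: StopWords(path), StopWords(data_home), StopWords().list,
# and StopWords(...).get_instance(). A minimal sketch of the file-backed
# variant, assuming one stop word per line (the real class may differ):
class StopWords:
    def __init__(self, path=None):
        self.list = []
        if path is not None:
            with open(path, 'r') as f:
                self.list = [line.strip() for line in f if line.strip()]

    def __contains__(self, word):
        return word in self.list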
def __init__(self, path):
    self.Documents = []
    # Characters allowed through cleaning: letters, digits, and a small
    # punctuation set. An earlier variant allowed only [',', '-', ' '].
    self.allowed = set([chr(i) for i in range(ord('a'), ord('z') + 1)] +
                       [chr(i) for i in range(ord('A'), ord('Z') + 1)] +
                       [',', '.', '?', '-', '!', ' '] +
                       [str(i) for i in range(10)])
    self.punctuation = [';', ':', '&', '?', '/']
    self.P = Partition(self.punctuation)
    self.tagger = PatternTagger()
    self.sw = StopWords()
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                self.Documents.append(line)
def __init__(self, path):
    data_home = os.path.split(path)[0]
    self.Documents = []
    self.allowed = set([chr(i) for i in range(ord('a'), ord('z') + 1)] +
                       [chr(i) for i in range(ord('A'), ord('Z') + 1)] +
                       [',', '.', '?', '-', '!', ' '] +
                       [str(i) for i in range(10)])
    punctuation = [';', ':', '&', '?', '/']
    # P = Partition(punctuation)  -- superseded by the three-argument form below
    self.tagger = PatternTagger()
    # The input file is tab-separated; the document text is the second column.
    with open(path, 'r') as f:
        for line in f:
            li = line.split('\t')[1].strip()
            if li:
                self.Documents.append(li)
    data_Inter_path = os.path.join(data_home, "Intermediate")
    self.inter = data_Inter_path
    self.P = Partition(punctuation, data_Inter_path, data_home)
    self.sw = StopWords(data_home)
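# A hedged usage sketch of the `allowed` set built above: filtering a raw
# line down to the permitted characters. The constructors never show the
# cleaning step itself, so how `allowed` is consumed is an assumption here.
allowed = set([chr(i) for i in range(ord('a'), ord('z') + 1)] +
              [chr(i) for i in range(ord('A'), ord('Z') + 1)] +
              [',', '.', '?', '-', '!', ' '] +
              [str(i) for i in range(10)])
raw = "naïve & café?"
clean = ''.join(ch for ch in raw if ch in allowed)
print(clean)  # -> "nave  caf?"  (accented letters and '&' are dropped)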
def __init__(self, device='cpu', hyper_params=None):
    sup = super()
    sup.__init__(device=device, hyper_params=hyper_params)
    # One embedding layer per configured embedding type; their outputs are
    # concatenated, so the working dimension is the sum of the parts.
    self.embeddings = nn.ModuleList([
        sup.get_embeddings(key=key, device=device)
        for key in self.hyper_params['embeddings']
    ])
    emb_dim = sum([item.embedding_dim for item in self.embeddings])
    self.hidden_size = emb_dim
    # Two stacked bidirectional passes, built from separate forward and
    # backward unidirectional GRUs.
    self.f_gru1 = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
    self.b_gru1 = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
    self.f_gru2 = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
    self.b_gru2 = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
    self.num_head = hyper_params['num_head']
    self.attention = nn.ModuleList(
        [Attention(dimensions=emb_dim) for _ in range(self.num_head)])
    self.dropout = nn.Dropout(hyper_params['dropout_ratio'])
    self.pooling = nn.AdaptiveAvgPool1d(1)
    # emb_dim + 1: one extra scalar input feature, presumably the cheatsheet flag.
    self.output = nn.Linear(emb_dim + 1, hyper_params['num_class'])
    self.to(device)
    with Path('../data/utils/cheatsheet.txt').open('r', encoding='utf-8-sig') as f:
        self.cheatsheet = set(line.strip() for line in f)
    self.added_stop_words = StopWords(with_applied=True).get_instance()
    self.tokenizer = Tokenizer().get_instance()
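# A minimal, self-contained sketch of the pattern used above: concatenate
# several embedding outputs, then run separate forward and backward GRUs,
# flipping the input for the backward pass. Vocab sizes and dimensions are
# illustrative assumptions, and how the model combines f/b outputs is not
# shown in the constructor.
import torch
import torch.nn as nn

word_emb = nn.Embedding(100, 16)
char_emb = nn.Embedding(100, 8)
emb_dim = word_emb.embedding_dim + char_emb.embedding_dim  # 24

f_gru = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
b_gru = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)

ids = torch.randint(0, 100, (2, 5))                    # (batch, seq_len)
x = torch.cat([word_emb(ids), char_emb(ids)], dim=-1)  # (2, 5, 24)
f_out, _ = f_gru(x)                                    # forward direction
b_out, _ = b_gru(torch.flip(x, dims=[1]))              # backward on reversed input
b_out = torch.flip(b_out, dims=[1])                    # re-align to forward order
print(f_out.shape, b_out.shape)                        # torch.Size([2, 5, 24]) each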
def __init__(self,
             special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2,
                             '</s>': 3, '<mask>': 4},
             with_del_stopwords=False, lower_count=0):
    # Note: '<\s>' was corrected to the conventional '</s>' end token, and
    # the reverse map is built from word2index so that passing
    # special_tokens=None no longer crashes. A mutable default argument is
    # shared across calls; a None sentinel would be the safer idiom.
    if special_tokens is None:
        self.word2index = {'<unk>': 0, '<pad>': 1}
        self.current = 2
    else:
        self.word2index = special_tokens
        self.current = len(special_tokens)
    self.index2word = {val: key for key, val in self.word2index.items()}
    self.vocab = set(self.word2index.keys())
    self.sentence2indexes, self.indexes2sentence = SentenceIndexer().get_instance()
    self.padding_index = self.word2index['<pad>']
    self.unknown_index = self.word2index['<unk>']
    self.delim = ' '
    self.counts = {}
    self.lower_count = lower_count
    self.max_length = 0
    self.stop_words = StopWords().get_instance()
    self.text_processor = Tokenizer().get_instance()
    self.with_del_stopwords = with_del_stopwords
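# Hedged standalone sketch of the token bookkeeping above: the reverse map
# and a lookup that falls back to the unknown index. The helper name is
# illustrative, not the class's real API.
special_tokens = {'<s>': 0, '<unk>': 1, '<pad>': 2, '</s>': 3, '<mask>': 4}
word2index = dict(special_tokens)
index2word = {val: key for key, val in word2index.items()}

def lookup(word):
    # Unseen words map to the '<unk>' index, mirroring unknown_index above.
    return word2index.get(word, word2index['<unk>'])

print(index2word[3])       # '</s>'
print(lookup('aardvark'))  # 1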
def __init__(self, punctuation, data_inter_path, data_path):
    self.punctuation = set(punctuation)
    self.num_words = 0
    self.sw = StopWords(data_path)
    # chdir so the segment file lands in the intermediate data directory;
    # note this changes the working directory for the whole process.
    os.chdir(data_inter_path)
    self.f = open('phrase_segments.txt', 'w')
import csv
import random
import re


class DataManager:
    data = []
    trainingData = []
    testData = []
    stopWords = StopWords().list

    # Training data
    titles = []
    texts = []
    sentiments = []
    words = []
    countingWords = {}
    badWords = {}
    neutralWords = {}
    goodWords = {}

    # Test data
    phrases = []
    textsTest = []
    sentimentsTest = []

    def __init__(self):
        self.data = self.getData()
        self.separateData()
        self.organizeTrainingData()
        self.separateTrainingWords()
        self.separateTestPhrases()

    def getData(self):
        read = []
        with open('chennai.csv', 'r') as csvfile:
            readCSV = csv.reader(csvfile, delimiter=';')
            for row in readCSV:
                read.append(row)
        # random.shuffle(read)
        return read

    def separateData(self):
        sizeTraining = int(0.8 * len(self.data))
        size = len(self.data)
        self.data.pop(0)  # drop the CSV header row
        random.shuffle(self.data)
        # Note: slicing from 1 skips one more row beyond the popped header.
        self.trainingData = self.data[1:sizeTraining]
        self.testData = self.data[sizeTraining:size]

    def organizeTrainingData(self):
        for row in self.trainingData:
            self.titles.append(row[1])
            self.texts.append(row[2])
            self.sentiments.append(row[3])

    def addToArrayUnique(self, word, local):
        if word.lower() not in local and word.lower() not in DataManager.stopWords:
            finalWord = self.removeCharacters(word.lower())
            if finalWord != "":
                local.append(finalWord)

    def addToDictionary(self, words, local):
        for word in words:
            if word in local:  # dict.has_key() was removed in Python 3
                local[word] = local[word] + 1
            else:
                local[word] = 1

    def separateTrainingWords(self):
        for i in range(0, len(self.texts)):
            wordsPerText = self.texts[i].split()
            for word in wordsPerText:
                self.addToArrayUnique(word, self.words)
            if self.sentiments[i] == '1':
                self.addToDictionary(self.words, self.badWords)
            elif self.sentiments[i] == '2':
                self.addToDictionary(self.words, self.neutralWords)
            elif self.sentiments[i] == '3':
                self.addToDictionary(self.words, self.goodWords)
            self.addToDictionary(self.words, self.countingWords)
            self.cleanData()

    def separateTestPhrases(self):
        a = 1
        dictionaryRow = {}
        dictionaryWords = {}
        for row in self.testData:
            if a == 2:
                dictionaryRow['titles'] = row[1]
                self.addToDictionary(self.separeteWords(row[2]), dictionaryWords)
                dictionaryRow['probabilityWords'] = dictionaryWords
                dictionaryRow['sentiments'] = row[3]
                self.phrases.append(dictionaryRow)
                dictionaryRow = {}
                dictionaryWords = {}
            else:
                a = 2  # skip the first test row

    def separeteWords(self, text):
        separatedWords = []
        wordsPerText = text.split()
        for word in wordsPerText:
            self.addToArrayUnique(word, separatedWords)
        return separatedWords

    def removeCharacters(self, word):
        # Unicode normalization would map a character to its Latin equivalent:
        # nfkd = unicodedata.normalize('NFKD', word)
        # word = u''.join([c for c in nfkd if not unicodedata.combining(c)])
        # Use a regular expression to keep only letters, digits, and spaces.
        return re.sub(r'[^a-zA-Z0-9 ]', '', word)

    def cleanData(self):
        self.words = []
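# A hedged standalone walk-through of addToDictionary's counting logic and
# removeCharacters' cleaning, since DataManager itself needs chennai.csv
# (a ';'-delimited file with title/text/sentiment columns) to run.
import re

def remove_characters(word):
    return re.sub(r'[^a-zA-Z0-9 ]', '', word)

counts = {}
for word in ['flood', 'rain!', 'flood']:
    w = remove_characters(word.lower())
    counts[w] = counts.get(w, 0) + 1
print(counts)  # {'flood': 2, 'rain': 1}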
def __init__(self):
    self.collection = [['w1', 'w2', 'w4', 'w6'],
                       ['w1', 'w2', 'w7', 'w3'],
                       ['w8', 'w5', 'w4', 'w5', 'w6']]
    self.dictionary = {}
    self.stopWords = StopWords(
        "D:/Information Retrieval/Assignment 2/stop words.txt")
def __init__(self):
    self.dictionary = {}
    self.stopWords = StopWords("E:/New folder (2)/IR assi/stop words.txt")
def __init__(self, punctuation):
    self.punctuation = set(punctuation)
    self.num_words = 0
    self.f = open('Intermediate/phrase_segments.txt', 'w')
    self.sw = StopWords()
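# Partition's implementation isn't shown; from its constructors it keeps a
# punctuation set and writes phrase segments to phrase_segments.txt. A
# minimal standalone sketch of what splitting on that punctuation set might
# look like (illustrative assumption, not the class's real behavior):
punctuation = {';', ':', '&', '?', '/'}

def segments(text):
    seg, out = [], []
    for ch in text:
        if ch in punctuation:
            if seg:
                out.append(''.join(seg).strip())
                seg = []
        else:
            seg.append(ch)
    if seg:
        out.append(''.join(seg).strip())
    return out

print(segments('a brown cat; the word: done'))
# ['a brown cat', 'the word', 'done']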