def __init__(self):
     self.collection = [['a', 'word', 'a', 'word', 'the'],
                        ['the', 'a', 'brown', 'cat', 'the', 'a'],
                        ['brown', 'cat', 'the', 'a', 'word']]
     self.dictionary = {}
     self.stopWords = StopWords(
         "D:/Information Retrieval/IR/stop words.txt")
Example #2
    def __init__(self, path):
        self.Documents = []
        # Letters, digits, and a small set of punctuation/whitespace characters are allowed.
        self.allowed = set(
            [chr(i) for i in xrange(ord('a'), ord('z') + 1)] +
            [chr(i) for i in xrange(ord('A'), ord('Z') + 1)] +
            [',', '.', '?', '-', '!', ' '] +
            [str(i) for i in xrange(10)])
        self.punctuation = [';', ':', '&', '?', "/"]
        self.P = Partition(self.punctuation)
        self.tagger = PatternTagger()
        self.sw = StopWords()
        with open(path, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.Documents.append(line)
Example #3
    def __init__(self, path):
        data_home = os.path.split(path)[0]
        self.Documents = []
        # Letters, digits, and a small set of punctuation/whitespace characters are allowed.
        self.allowed = set(
            [chr(i) for i in xrange(ord('a'), ord('z') + 1)] +
            [chr(i) for i in xrange(ord('A'), ord('Z') + 1)] +
            [',', '.', '?', '-', '!', ' '] +
            [str(i) for i in xrange(10)])
        punctuation = [';', ':', '&', '?', "/"]

        #P = Partition(punctuation)
        self.tagger = PatternTagger()
        with open(path, 'r') as f:
            for line in f:
                # Lines are tab-separated; the document text is taken from the second column.
                li = line.split("\t")[1].strip()
                if li:
                    self.Documents.append(li)
        data_Inter_path = os.path.join(data_home, "Intermediate")
        self.inter = data_Inter_path
        self.P = Partition(punctuation, data_Inter_path, data_home)
        self.sw = StopWords(data_home)
Example #4
    def __init__(self, device='cpu', hyper_params=None):
        sup = super()
        sup.__init__(device=device, hyper_params=hyper_params)
        self.embeddings = nn.ModuleList([
            sup.get_embeddings(key=key, device=device)
            for key in self.hyper_params['embeddings']
        ])

        emb_dim = sum([item.embedding_dim for item in self.embeddings])
        self.hidden_size = emb_dim
        self.f_gru1 = nn.GRU(input_size=emb_dim,
                             hidden_size=emb_dim,
                             batch_first=True)
        self.b_gru1 = nn.GRU(input_size=emb_dim,
                             hidden_size=emb_dim,
                             batch_first=True)
        self.f_gru2 = nn.GRU(input_size=emb_dim,
                             hidden_size=emb_dim,
                             batch_first=True)
        self.b_gru2 = nn.GRU(input_size=emb_dim,
                             hidden_size=emb_dim,
                             batch_first=True)

        self.num_head = hyper_params['num_head']
        self.attention = nn.ModuleList(
            [Attention(dimensions=emb_dim) for _ in range(self.num_head)])

        self.dropout = nn.Dropout(hyper_params['dropout_ratio'])

        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.output = nn.Linear(emb_dim + 1, hyper_params['num_class'])

        self.to(device)

        with Path('../data/utils/cheatsheet.txt').open(
                'r', encoding='utf-8-sig') as f:
            self.cheatsheet = set([line.strip() for line in f.readlines()])

        self.added_stop_words = StopWords(with_applied=True).get_instance()
        self.tokenizer = Tokenizer().get_instance()
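The snippet above documents its hyper-parameters only through the keys it reads ('embeddings', 'num_head', 'dropout_ratio', 'num_class'). A hedged sketch of a hyper_params dict that would satisfy the constructor; the class name Model and the embedding key names are assumptions, not part of the source:

hyper_params = {
    'embeddings': ['word', 'char'],  # each key is passed to get_embeddings()
    'num_head': 4,                   # number of attention heads
    'dropout_ratio': 0.3,
    'num_class': 2,
}
model = Model(device='cpu', hyper_params=hyper_params)  # Model is a placeholder name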
Example #5
    def __init__(self, special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2, '<\\s>': 3, '<mask>': 4}, with_del_stopwords=False, lower_count=0):
        # Fall back to a minimal token map when None is passed, then build every
        # lookup table from the same mapping.
        if special_tokens is None:
            special_tokens = {'<unk>': 0, '<pad>': 1}
        self.word2index = dict(special_tokens)  # copy so the default dict is never mutated
        self.current = len(special_tokens)
        self.index2word = {val: key for key, val in special_tokens.items()}
        self.vocab = set(special_tokens.keys())

        self.sentence2indexes, self.indexes2sentence = SentenceIndexer().get_instance()

        self.padding_index = self.word2index['<pad>']
        self.unknown_index = self.word2index['<unk>']

        self.delim = ' '
        self.counts = {}
        self.lower_count = lower_count
        self.max_length = 0

        self.stop_words = StopWords().get_instance()
        self.text_processor = Tokenizer().get_instance()
        self.with_del_stopwords = with_del_stopwords
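With the None handling above, the two construction paths differ only in which token map seeds the vocabulary. A brief sketch (the class name Vocabulary is assumed; it is not given in the snippet):

vocab = Vocabulary()                                     # default special tokens
print(vocab.padding_index, vocab.unknown_index)          # -> 2 1
vocab_min = Vocabulary(special_tokens=None)              # fallback map {'<unk>': 0, '<pad>': 1}
print(vocab_min.padding_index, vocab_min.unknown_index)  # -> 1 0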
Example #6
 def __init__(self, punctuation, data_inter_path, data_path):
     self.punctuation = set(punctuation)
     self.num_words = 0
     self.sw = StopWords(data_path)
     os.chdir(data_inter_path)
     self.f = open('phrase_segments.txt', 'w')
Example #7
class DataManager:
    data = []
    trainingData = []
    testData = []
    stopWords = StopWords().list

    #Training Data
    titles = []
    texts = []
    sentiments = []
    words = []

    countingWords = {}
    badWords = {}
    neutralWords = {}
    goodWords = {}

    #Test Data
    phrases = []
    textsTest = []
    sentimentsTest = []

    def __init__(self):
        self.data = self.getData()
        self.separateData()
        self.organizeTrainingData()
        self.separateTrainingWords()
        self.separateTestPhrases()

    def getData(self):
        read = []
        with open('chennai.csv', 'r') as csvfile:
            readCSV = csv.reader(csvfile, delimiter=';')

            for row in readCSV:
                read.append(row)

        #random.shuffle(read)
        return read

    def separateData(self):
        self.data.pop(0)  # drop the CSV header row
        random.shuffle(self.data)
        sizeTraining = int(0.8 * len(self.data))
        self.trainingData = self.data[:sizeTraining]
        self.testData = self.data[sizeTraining:]

    def organizeTrainingData(self):
        for row in self.trainingData:
            self.titles.append(row[1])
            self.texts.append(row[2])
            self.sentiments.append(row[3])

    def addToArrayUnique(self, word, local):
        if word.lower() not in local and word.lower(
        ) not in DataManager.stopWords:
            finalWord = self.removeCharacters(word.lower())
            if finalWord != "":
                local.append(finalWord)

    def addToDictionary(self, words, local):
        for word in words:
            if word in local:
                local[word] += 1
            else:
                local[word] = 1

    def separateTrainingWords(self):
        for i in range(0, len(self.texts)):
            wordsPerText = self.texts[i].split()
            for word in wordsPerText:
                self.addToArrayUnique(word, self.words)

            if self.sentiments[i] == '1':
                self.addToDictionary(self.words, self.badWords)
            elif self.sentiments[i] == '2':
                self.addToDictionary(self.words, self.neutralWords)
            elif self.sentiments[i] == '3':
                self.addToDictionary(self.words, self.goodWords)

            self.addToDictionary(self.words, self.countingWords)
            self.cleanData()

    def separateTestPhrases(self):
        a = 1
        dictionaryRow = {}
        dictionaryWords = {}
        for row in self.testData:
            if a == 2:
                dictionaryRow['titles'] = row[1]
                self.addToDictionary(self.separeteWords(row[2]),
                                     dictionaryWords)
                dictionaryRow['probabilityWords'] = dictionaryWords
                dictionaryRow['sentiments'] = row[3]
                self.phrases.append(dictionaryRow)
                dictionaryRow = {}
                dictionaryWords = {}
            else:
                a = 2

    def separeteWords(self, text):
        separatedWords = []
        wordsPerText = text.split()
        for word in wordsPerText:
            self.addToArrayUnique(word, separatedWords)

        return separatedWords

    def removeCharacters(self, word):

        # Unicode normalization would map accented characters to their plain Latin equivalents:
        # nfkd = unicodedata.normalize('NFKD', word)
        # palavraSemAcento = u''.join([c for c in nfkd if not unicodedata.combining(c)])

        # Use a regular expression to keep only letters, digits and spaces.
        return re.sub('[^a-zA-Z0-9 ]', '', word)

    def cleanData(self):
        self.words = []
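DataManager's column handling is implicit; the mapping below is inferred from organizeTrainingData() and from the three word buckets, and the call sketch assumes chennai.csv is present in the working directory:

# Inferred layout of the semicolon-delimited chennai.csv (header row first):
#   row[1] -> title, row[2] -> text, row[3] -> sentiment
#   sentiment values: '1' (bad), '2' (neutral), '3' (good)
dm = DataManager()
print(len(dm.trainingData), len(dm.testData))  # 80/20 split after shuffling
print(sorted(dm.goodWords.items(), key=lambda kv: kv[1], reverse=True)[:5])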
Example #8
 def __init__(self):
     self.collection = [['w1', 'w2', 'w4', 'w6'], ['w1', 'w2', 'w7', 'w3'],
                        ['w8', 'w5', 'w4', 'w5', 'w6']]
     self.dictionary = {}
     self.stopWords = StopWords(
         "D:/Information Retrieval/Assignment 2/stop words.txt")
 def __init__(self):
   
     self.dictionary = {}
     self.stopWords = StopWords("E:/New folder (2)/IR assi/stop words.txt")
Example #10
 def __init__(self, punctuation):
     self.punctuation = set(punctuation)
     self.num_words = 0
     self.f = open('Intermediate/phrase_segments.txt', 'w')
     self.sw = StopWords()
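The constructor writes to Intermediate/phrase_segments.txt relative to the current directory, so that folder must already exist; a small hedged guard one might run before instantiating:

import os

if not os.path.isdir('Intermediate'):
    os.makedirs('Intermediate')
p = Partition([';', ':', '&', '?', '/'])  # punctuation list taken from the snippets above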