def __init__(self):
     self.collection = [['a', 'word', 'a', 'word', 'the'],
                        ['the', 'a', 'brown', 'cat', 'the', 'a'],
                        ['brown', 'cat', 'the', 'a', 'word']]
     self.dictionary = {}
     self.stopWords = StopWords(
         "D:/Information Retrieval/IR/stop words.txt")
Example #2
class TestStopWords(unittest.TestCase):
    def setUp(self):
        self.s = StopWords()

    def testIsStopWord(self):
        self.assertTrue(self.s.is_stop_word('a'))

    def testIsStopWord2(self):
        self.assertFalse(self.s.is_stop_word('rare word'))
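
The tests above assume a StopWords helper; a minimal sketch that would satisfy them (the default word list, the optional path argument and the removeWords helper are assumptions, since each project on this page ships its own implementation):

class StopWords:
    def __init__(self, path=None):
        # Load stop words from a file if a path is given, otherwise use a tiny default list.
        if path:
            with open(path) as f:
                self.list = set(w.strip().lower() for w in f if w.strip())
        else:
            self.list = {'a', 'an', 'and', 'the', 'of', 'to', 'in', 'is'}

    def is_stop_word(self, word):
        return word.lower() in self.list

    def removeWords(self, text):
        # Return the text with stop words dropped.
        return " ".join(w for w in text.split() if not self.is_stop_word(w))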
Example #3
class Partition:
    def __init__(self, punctuation, data_inter_path, data_path):
        self.punctuation = set(punctuation)
        self.num_words = 0
        self.sw = StopWords(data_path)
        os.chdir(data_inter_path)
        self.f = open('phrase_segments.txt', 'w')

    def split(self, sentence):
        new_sent = [None] * len(sentence)
        for i in xrange(len(sentence)):
            if sentence[i] in self.punctuation:
                new_sent[i] = ','
            else:
                new_sent[i] = sentence[i]
        mining_sentence = "".join(new_sent).lower().split(',')
        sentence = sentence.split(",")

        for seg in mining_sentence:
            seg = seg.split()
            new_set = []
            for word in seg:
                if not self.sw.isStopWord(word):
                    new_set.append(word)
            seg = " ".join(new_set)
            seg = seg.strip()
            if seg:
                self.f.write(seg + "\n")
        return sentence
Example #4
class Partition:
    def __init__(self, punctuation):
        self.punctuation = set(punctuation)
        self.num_words = 0
        self.f = open('Intermediate/phrase_segments.txt','w')
        self.sw = StopWords()

    def split(self, sentence):
        new_sent = [None]*len(sentence)
        for i in xrange(len(sentence)):
            if sentence[i] in self.punctuation:
                new_sent[i] = ','
            else:
                new_sent[i] = sentence[i]
        mining_sentence = "".join(new_sent).lower().split(',')
        sentence = sentence.split(",")

        for seg in mining_sentence:
            seg = seg.split()
            new_set = []
            for word in seg:
                if not self.sw.isStopWord(word):
                    new_set.append(word)
            seg = " ".join(new_set)
            seg = seg.strip()
            if seg:
                self.f.write(seg+"\n")
        return sentence
Example #5
    def __init__(self, path):
        self.Documents = []
        self.allowed = set([chr(i) for i in xrange(ord('a'), ord('z')+1)] +
                           [chr(i) for i in xrange(ord('A'), ord('Z')+1)] +
                           # [',','-',' '] + [str(i) for i in xrange(10)]
                           [',', '.', '?', '-', '!', ' '] +
                           [str(i) for i in xrange(10)])
        self.punctuation = [';', ':', '&', '?', "/"]
        self.P = Partition(self.punctuation)
        self.tagger = PatternTagger()
        self.sw = StopWords()
        with open(path, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.Documents.append(line)
Example #6
    def __init__(self, path):
        data_home = os.path.split(path)[0]
        self.Documents = []
        self.allowed = set([chr(i) for i in xrange(ord('a'), ord('z')+1)] +
                           [chr(i) for i in xrange(ord('A'), ord('Z')+1)] +
                           # [',','-',' '] + [str(i) for i in xrange(10)]
                           [',', '.', '?', '-', '!', ' '] +
                           [str(i) for i in xrange(10)])
        punctuation = [';', ':', '&', '?', "/"]

        #P = Partition(punctuation)
        self.tagger = PatternTagger()
        with open(path, 'r') as f:
            for line in f.readlines():
                li = line.split("\t")[1].strip()
                if li:
                    self.Documents.append(li)
        data_Inter_path = os.path.join(data_home, "Intermediate")
        self.inter = data_Inter_path
        self.P = Partition(punctuation, data_Inter_path, data_home)
        self.sw = StopWords(data_home)
Example #7
class PositionalIndex():
    def __init__(self):
        self.collection = [['a', 'word', 'a', 'word', 'the'],
                           ['the', 'a', 'brown', 'cat', 'the', 'a'],
                           ['brown', 'cat', 'the', 'a', 'word']]
        self.dictionary = {}
        self.stopWords = StopWords(
            "D:/Information Retrieval/IR/stop words.txt")

    def loadDocuments(self):
        self.collection = []
        for i in range(1, 51):
            filename = "D:/Information Retrieval/IR/ShortStories/" + str(
                i) + ".txt"
            s = ""
            with open(filename) as f_obj:
                for line in f_obj:
                    if (line != '\n'):
                        l = re.sub('[^a-zA-Z0-9\s]|[\n]', '', line)
                        l = self.stopWords.removeWords(l)
                        s = s + l.lower() + " "
            lines = s.split(" ")
            self.collection.append(lines)

    def buildDictionary(self):
        for i in range(0, len(self.collection)):
            array = self.collection[i]

            for j in range(0, len(array)):
                if (array[j] not in self.dictionary):
                    docId = i + 1
                    d = {docId: [j]}
                    self.dictionary[array[j]] = d
                else:
                    d = self.dictionary[array[j]]

                    if (i + 1) in d:
                        l = d[i + 1]
                        l.append(j)
                        d[i + 1] = l

                    else:
                        docId = i + 1
                        d[docId] = [j]

                    self.dictionary[array[j]] = d

    def getPositionalIndex(self, key):
        if key not in self.dictionary:
            return []
        return self.dictionary.get(key)
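
A minimal usage sketch for the positional index above, assuming the stop-word file referenced in __init__ exists:

pi = PositionalIndex()
pi.buildDictionary()
print(pi.getPositionalIndex('a'))        # {1: [0, 2], 2: [1, 5], 3: [3]}: docId -> positions
print(pi.getPositionalIndex('missing'))  # [] when the term was never indexed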
Example #8
 def __init__(self, path):
     self.Documents = []
     self.allowed = set([chr(i) for i in xrange(ord('a'), ord('z')+1)] +
                        [chr(i) for i in xrange(ord('A'), ord('Z')+1)] +
                        # [',','-',' '] + [str(i) for i in xrange(10)]
                        [',', '.', '?', '-', '!', ' '] +
                        [str(i) for i in xrange(10)])
     self.punctuation = [';',':','&', '?', "/"]
     self.P = Partition(self.punctuation)
     self.tagger = PatternTagger()
     self.sw = StopWords()
     with open(path,'r') as f:
         for line in f:
             line = line.strip()
             if line:
                 self.Documents.append(line)
Example #9
    def term_to_id(self, term0):

        term = Preprocessing.convert_word_to_normal_form(term0)
        term = Preprocessing.lemmatize(term)
        if not re.match(r'[a-zа-я]+$', term):
            return None
        if self.excluds_stopwords and StopWords.is_stop_word(term):
            return None
        try:
            term_id = self.vocas_id[term]
        except:
            term_id = len(self.vocas)
            self.vocas_id[term] = term_id
            self.vocas.append(term)
            self.docfreq.append(0)
        return term_id
Example #10
class InvertedIndex():
    def __init__(self):
        self.collection = [
            ['a','word','a','word','the'],
            ['the', 'a', 'brown', 'cat', 'the', 'a'],
            ['brown', 'cat', 'the', 'a', 'word']
        ]
        self.dictionary = {}
        self.stopWords = StopWords("D:/Information Retrieval/IR/stop words.txt")

    def loadDocuments(self):
        self.collection = []
        for i in range(1, 51):
            filename = "D:/Information Retrieval/IR/ShortStories/"+str(i)+".txt"
            s = ""
            with open(filename) as f_obj:
                for line in f_obj:
                    if(line != '\n'):
                        l = re.sub('[^a-zA-Z0-9\s]|[\n]', '', line)
                        l = self.stopWords.removeWords(l)
                        s = s + l.lower() + " "
            lines = s.split(" ")
            self.collection.append(lines)

    def buildDictionary(self):
        for i in range(0 ,len(self.collection)):
            array = self.collection[i]

            for j in range(0,len(array)):
                if(array[j] not in self.dictionary):
                    l = []
                    l.append(i+1)
                    self.dictionary[array[j]] = l
                else:
                    l = self.dictionary[array[j]]
                    l.append(i+1)
                    self.dictionary[array[j]] = l

        for key,value in self.dictionary.items():
            self.dictionary[key] = list(set(value))

    def getInvertedIndex(self, key):
        if key not in self.dictionary:
            return []
        return self.dictionary.get(key)
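
A minimal usage sketch for the inverted index above, assuming the stop-word file referenced in __init__ exists:

ii = InvertedIndex()
ii.buildDictionary()
print(ii.getInvertedIndex('brown'))    # e.g. [2, 3]: documents containing 'brown'
print(ii.getInvertedIndex('missing'))  # [] for terms that never occur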
Example #11
 def __init__(self, path):
     self.Documents = []
     self.allowed = set(
         [chr(i) for i in xrange(ord("a"), ord("z") + 1)]
         + [chr(i) for i in xrange(ord("A"), ord("Z") + 1)]
         +
         # [',','-',' '] + [str(i) for i in xrange(10)])
         [",", ".", "?", "-", "!", " "]
         + [str(i) for i in xrange(10)]
     )
     self.punctuation = [";", ":", "&", "?", "/"]
     self.P = Partition(self.punctuation)
     self.tagger = PatternTagger()
     self.sw = StopWords()
     with open(path, "r") as f:
         for line in f:
             line = line.strip()
             if line:
                 self.Documents.append(line)
Example #12
    def __init__(self, device='cpu', hyper_params=None):
        sup = super()
        sup.__init__(device=device, hyper_params=hyper_params)
        self.embeddings = nn.ModuleList([
            sup.get_embeddings(key=key, device=device)
            for key in self.hyper_params['embeddings']
        ])

        emb_dim = sum([item.embedding_dim for item in self.embeddings])
        self.hidden_size = emb_dim
        self.f_gru1 = nn.GRU(input_size=emb_dim,
                             hidden_size=emb_dim,
                             batch_first=True)
        self.b_gru1 = nn.GRU(input_size=emb_dim,
                             hidden_size=emb_dim,
                             batch_first=True)
        self.f_gru2 = nn.GRU(input_size=emb_dim,
                             hidden_size=emb_dim,
                             batch_first=True)
        self.b_gru2 = nn.GRU(input_size=emb_dim,
                             hidden_size=emb_dim,
                             batch_first=True)

        self.num_head = hyper_params['num_head']
        self.attention = nn.ModuleList(
            [Attention(dimensions=emb_dim) for _ in range(self.num_head)])

        self.dropout = nn.Dropout(hyper_params['dropout_ratio'])

        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.output = nn.Linear(emb_dim + 1, hyper_params['num_class'])

        self.to(device)

        with Path('../data/utils/cheatsheet.txt').open(
                'r', encoding='utf-8-sig') as f:
            self.cheatsheet = set([line.strip() for line in f.readlines()])

        self.added_stop_words = StopWords(with_applied=True).get_instance()
        self.tokenizer = Tokenizer().get_instance()
Example #13
    def __init__(self, special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2, '<\s>': 3, '<mask>': 4}, with_del_stopwords=False, lower_count=0):
        if special_tokens is None:
            self.word2index = {'<unk>': 0, '<pad>': 1}
            self.current = 2
        else:
            self.word2index = special_tokens
            self.current = len(special_tokens)
        # Build the reverse map from word2index so the special_tokens=None branch also works.
        self.index2word = {val: key for key, val in self.word2index.items()}
        self.vocab = set(self.word2index.keys())

        self.sentence2indexes, self.indexes2sentence = SentenceIndexer().get_instance()

        self.padding_index = self.word2index['<pad>']
        self.unknown_index = self.word2index['<unk>']

        self.delim = ' '
        self.counts = {}
        self.lower_count = lower_count
        self.max_length = 0

        self.stop_words = StopWords().get_instance()
        self.text_processor = Tokenizer().get_instance()
        self.with_del_stopwords = with_del_stopwords
Example #14
 def __init__(self):
     self.collection = [['w1', 'w2', 'w4', 'w6'], ['w1', 'w2', 'w7', 'w3'],
                        ['w8', 'w5', 'w4', 'w5', 'w6']]
     self.dictionary = {}
     self.stopWords = StopWords(
         "D:/Information Retrieval/Assignment 2/stop words.txt")
Example #15
class Clean:
    def __init__(self, path):
        self.Documents = []
        self.allowed = set([chr(i) for i in xrange(ord('a'), ord('z')+1)] +
                           [chr(i) for i in xrange(ord('A'), ord('Z')+1)] +
                           # [',','-',' '] + [str(i) for i in xrange(10)]
                           [',', '.', '?', '-', '!', ' '] +
                           [str(i) for i in xrange(10)])
        self.punctuation = [';', ':', '&', '?', "/"]
        self.P = Partition(self.punctuation)
        self.tagger = PatternTagger()
        self.sw = StopWords()
        with open(path, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.Documents.append(line)

    def is_number(self, s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    def remove_stopwords(self, words, pos):
        new_sent = []
        new_pos = []
        for i in xrange(len(words)):
            if not self.sw.isStopWord(words[i]):
                new_sent.append(words[i])
                new_pos.append(pos[i])
        return new_sent, new_pos

    def replace_nums(self, s):
        sent = str(s)
        if sent[len(sent) - 1] == ".":
            sent = sent[0:len(sent) - 1]
        sent = sent.split()
        new_sent = []
        for word in sent:
            if self.is_number(word):
                pass
                #new_sent.append("999999")
            else:
                new_sent.append(word)
        sent = " ".join(new_sent)

        return sent

    def remove_things(self, string):
        string = string.replace("\t", " ")
        string = string.replace(" and ", ", and ")
        new_string = [char for char in string if char in self.allowed]
        return "".join(new_string)

    def clean_and_tag(self):
        with open('Intermediate/full_sentences.txt', 'w') as f,\
                open('Intermediate/full_pos.txt','w') as g,\
                open('Intermediate/sentences.txt', 'w') as m,\
                open('Intermediate/pos.txt', 'w') as n:
            for i in xrange(len(self.Documents)):
                if i % 10000 == 0 and i != 0:
                    print str(i) + " documents processed."
                doc = self.Documents[i]
                cleaned_doc = self.remove_things(doc)
                blob = TextBlob(cleaned_doc)
                for j in xrange(len(blob.sentences)):
                    sent = blob.sentences[j]
                    sent = self.replace_nums(sent)
                    split_sentence = self.P.split(sent)

                    for k in xrange(len(split_sentence)):
                        frag = split_sentence[k]
                        sent_blob = TextBlob(frag, pos_tagger=self.tagger)
                        words, pos = [], []
                        for word, tag in sent_blob.pos_tags:
                            words.append(word)
                            pos.append(tag)
                        f.write(
                            str(i) + ":" + str(j) + ":" + str(k) + ":" +
                            (" ".join(words) + "\n"))
                        g.write(" ".join(pos) + "\n")
                        no_stop_words, no_stop_pos = self.remove_stopwords(
                            words, pos)
                        m.write(
                            str(i) + ":" + str(j) + ":" + str(k) + ":" +
                            (" ".join(no_stop_words) + "\n"))
                        n.write(" ".join(no_stop_pos) + "\n")
Example #16
 def __init__(self, punctuation):
     self.punctuation = set(punctuation)
     self.num_words = 0
     self.f = open('Intermediate/phrase_segments.txt','w')
     self.sw = StopWords()
Example #17
 def __init__(self, fh):
     self.stop_words = StopWords()
     self.__fh = fh
     self.__reader = TxtReader(fh)
     self.build_index()
Example #18
class TxtIndex:
    def __init__(self, fh):
        self.stop_words = StopWords()
        self.__fh = fh
        self.__reader = TxtReader(fh)
        self.build_index()

    def build_index(self):
        self.keyword2pointers = {}
        self.__reader.seek(0)

        while True:
            word = self.__reader.nextWord()
            if word == None:
                break

            word = self.norm_word(word)
            if self.stop_words.is_stop_word(word):
                continue

            if word not in self.keyword2pointers:
                self.keyword2pointers[word] = []

            self.keyword2pointers[word].append(self.__reader.startPointer())

    def norm_word(self, word):
        word = word.lower()
        word = word.rstrip(string.punctuation)
        return word

    def norm_phrase(self, phrase):
        phrase = re.sub('\s+', ' ', phrase)
        return ' '.join([self.norm_word(x) for x in phrase.split(' ')])

    def get_pointers(self, word):
        return self.keyword2pointers.get(self.norm_word(word))

    def exact_search(self, phrase):
        phrase = self.norm_phrase(phrase)
        words = phrase.split(' ')

        if len(words) == 1:
            return self.get_pointers(words[0])

        try_word = None
        try_word_idx = None
        try_word_pointers = []

        for i in range(len(words)):
            word = words[i]
            if self.stop_words.is_stop_word(word):
                continue

            pointers = self.get_pointers(word)
            if pointers == None:
                return None

            if try_word == None or len(try_word_pointers) > len(pointers):
                try_word = word
                try_word_idx = i
                try_word_pointers = pointers

        # Extend around the chosen (rarest) word, not around the final loop index.
        extend_left_by = len(' '.join(words[0:try_word_idx])) + len(words[0:try_word_idx])
        extend_right_by = len(' '.join(words[try_word_idx:])) + len(words[try_word_idx:])
        phrase_re = re.compile(phrase.replace(' ', '\s+'), re.I)
        found = []

        for pointer in try_word_pointers:
            s = pointer - extend_left_by
            l = extend_left_by + extend_right_by
            if s < 0:
                s = 0

            self.__fh.seek(s)
            excerpt = self.__fh.read(l)
            m = phrase_re.search(excerpt)
            if m:
                found.append(s + m.start())

        if len(found) > 0:
            return found
        else:
            return None

    def get_word_pointers(self, phrase):
        phrase = self.norm_phrase(phrase)
        words = phrase.split(' ')

        if len(words) == 1:
            return {words[0]: self.get_pointers(words[0])}

        word_pointers = {}

        for word in words:
            if word in word_pointers or self.stop_words.is_stop_word(word):
                continue

            word_pointers[word] = self.get_pointers(word)

        return word_pointers
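
A minimal usage sketch for TxtIndex; the corpus file name is an assumption, and TxtReader and StopWords come from the surrounding project:

with open('corpus.txt', 'r') as fh:         # hypothetical plain-text corpus
    index = TxtIndex(fh)
    print(index.get_pointers('cat'))        # byte offsets where 'cat' occurs, or None
    print(index.exact_search('brown cat'))  # offsets where the whole phrase matches, or None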
Example #19
 def __init__(self, punctuation):
     self.punctuation = set(punctuation)
     self.num_words = 0
     self.f = open('Intermediate/phrase_segments.txt', 'w')
     self.sw = StopWords()
Example #20
 def __init__(self, punctuation, data_inter_path, data_path):
     self.punctuation = set(punctuation)
     self.num_words = 0
     self.sw = StopWords(data_path)
     os.chdir(data_inter_path)
     self.f = open('phrase_segments.txt', 'w')
Example #21
class Clean:
    def __init__(self, path):
        self.Documents = []
        self.allowed = set([chr(i) for i in xrange(ord('a'), ord('z')+1)] +
                           [chr(i) for i in xrange(ord('A'), ord('Z')+1)] +
                           # [',','-',' '] + [str(i) for i in xrange(10)]
                           [',', '.', '?', '-', '!', ' '] +
                           [str(i) for i in xrange(10)])
        self.punctuation = [';',':','&', '?', "/"]
        self.P = Partition(self.punctuation)
        self.tagger = PatternTagger()
        self.sw = StopWords()
        with open(path,'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.Documents.append(line)
    def is_number(self,s):
        try:
            float(s)
            return True
        except ValueError:
            return False
    def remove_stopwords(self, words, pos):
        new_sent = []
        new_pos = []
        for i in xrange(len(words)):
            if not self.sw.isStopWord(words[i]):
                new_sent.append(words[i])
                new_pos.append(pos[i])
        return new_sent,new_pos

    def replace_nums(self,s):
        sent = str(s)
        if sent[len(sent)-1] == ".":
            sent = sent[0:len(sent)-1]
        sent = sent.split()
        new_sent = []
        for word in sent:
            if self.is_number(word):
                pass
                #new_sent.append("999999")
            else:
                new_sent.append(word)
        sent = " ".join(new_sent)

        return sent
    def remove_things(self, string):
        string = string.replace("\t", " ")
        string = string.replace(" and ", ", and ")
        new_string = [char for char in string if char in self.allowed]
        return "".join(new_string)

    def clean_and_tag(self):
        with open('Intermediate/full_sentences.txt', 'w') as f,\
                open('Intermediate/full_pos.txt','w') as g,\
                open('Intermediate/sentences.txt', 'w') as m,\
                open('Intermediate/pos.txt', 'w') as n:
            for i in xrange(len(self.Documents)):
                if i%10000 == 0 and i!=0:
                    print str(i)+" documents processed."
                doc = self.Documents[i]
                cleaned_doc = self.remove_things(doc)
                blob = TextBlob(cleaned_doc)
                for j in xrange(len(blob.sentences)):
                    sent = blob.sentences[j]
                    sent = self.replace_nums(sent)
                    split_sentence = self.P.split(sent)

                    for k in xrange(len(split_sentence)):
                        frag = split_sentence[k]
                        sent_blob = TextBlob(frag, pos_tagger=self.tagger)
                        words, pos = [],[]
                        for word,tag in sent_blob.pos_tags:
                            words.append(word)
                            pos.append(tag)
                        f.write(str(i)+":"+str(j)+":"+str(k)+":"+(" ".join(words)+"\n"))
                        g.write(" ".join(pos)+"\n")
                        no_stop_words, no_stop_pos = self.remove_stopwords(words,pos)
                        m.write(str(i)+":"+str(j)+":"+str(k)+":"+(" ".join(no_stop_words)+"\n"))
                        n.write(" ".join(no_stop_pos)+"\n")
Example #22
class DataManager:
    data = []
    trainingData = []
    testData = []
    stopWords = StopWords().list

    #Training Data
    titles = []
    texts = []
    sentiments = []
    words = []

    countingWords = {}
    badWords = {}
    neutralWords = {}
    goodWords = {}

    #Test Data
    phrases = []
    textsTest = []
    sentimentsTest = []

    def __init__(self):
        self.data = self.getData()
        self.separateData()
        self.organizeTrainingData()
        self.separateTrainingWords()
        self.separateTestPhrases()

    def getData(self):
        read = []
        with open('chennai.csv', 'r') as csvfile:
            readCSV = csv.reader(csvfile, delimiter=';')

            for row in readCSV:
                read.append(row)

        #random.shuffle(read)
        return read

    def separateData(self):
        sizeTraining = int(0.8 * len(self.data))
        size = len(self.data)
        self.data.pop(0)
        random.shuffle(self.data)
        self.trainingData = self.data[1:sizeTraining]
        self.testData = self.data[sizeTraining:size]

    def organizeTrainingData(self):
        for row in self.trainingData:
            self.titles.append(row[1])
            self.texts.append(row[2])
            self.sentiments.append(row[3])

    def addToArrayUnique(self, word, local):
        if word.lower() not in local and word.lower(
        ) not in DataManager.stopWords:
            finalWord = self.removeCharacters(word.lower())
            if finalWord != "":
                local.append(finalWord)

    def addToDictionary(self, words, local):
        for word in words:
            if local.has_key(word):
                local[word] = local[word] + 1
            else:
                local[word] = 1

    def separateTrainingWords(self):
        for i in range(0, len(self.texts)):
            wordsPerText = self.texts[i].split()
            for word in wordsPerText:
                self.addToArrayUnique(word, self.words)

            if self.sentiments[i] == '1':
                self.addToDictionary(self.words, self.badWords)
            elif self.sentiments[i] == '2':
                self.addToDictionary(self.words, self.neutralWords)
            elif self.sentiments[i] == '3':
                self.addToDictionary(self.words, self.goodWords)

            self.addToDictionary(self.words, self.countingWords)
            self.cleanData()

    def separateTestPhrases(self):
        a = 1
        dictionaryRow = {}
        dictionaryWords = {}
        for row in self.testData:
            if a == 2:
                dictionaryRow['titles'] = row[1]
                self.addToDictionary(self.separeteWords(row[2]),
                                     dictionaryWords)
                dictionaryRow['probabilityWords'] = dictionaryWords
                dictionaryRow['sentiments'] = row[3]
                self.phrases.append(dictionaryRow)
                dictionaryRow = {}
                dictionaryWords = {}
            else:
                a = 2

    def separeteWords(self, text):
        separatedWords = []
        wordsPerText = text.split()
        for word in wordsPerText:
            self.addToArrayUnique(word, separatedWords)

        return separatedWords

    def removeCharacters(self, word):

        # Unicode normalization turns a character into its Latin equivalent.
        # nfkd = unicodedata.normalize('NFKD', word)
        # palavraSemAcento = u''.join([c for c in nfkd if not unicodedata.combining(c)])

        # Use a regular expression to keep only numbers, letters and spaces in the word.
        return re.sub('[^a-zA-Z0-9 \\\]', '', word)

    def cleanData(self):
        self.words = []
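
A minimal usage sketch; it assumes a semicolon-delimited chennai.csv in the working directory whose rows hold the title, the text and a sentiment label ('1' bad, '2' neutral, '3' good) in columns 1-3:

dm = DataManager()
print len(dm.trainingData), len(dm.testData)                       # roughly an 80/20 split
print sorted(dm.countingWords.items(), key=lambda kv: -kv[1])[:5]  # most frequent words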
Example #23
 def setUp(self):
     self.s = StopWords()
Example #24
 def get_stopwords(self):
     lang = self.args.get('lang_code')
     return StopWords.get_stop_words(lang)
Example #25
            grams = self.generate_ngrams(tokens, 3)
            grams.extend(self.generate_ngrams(tokens, 2))
            for word in grams:
                if word not in stop_tokens:
                    doc_grams.append(space_join(word))

        pattern = r'<VERB>?<ADV>*<VERB>+'
        doc = textacy.Doc(sentence, lang=model)
        lists = textacy.extract.pos_regex_matches(doc, pattern)
        verbs_list = []
        for l in lists:
            verb_tokens = l.lemma_.split()
            for verb in verb_tokens:
                if verb not in stop_tokens and self.is_valid_word(verb):
                    verbs_list.append(verb)
        return doc_grams, unigrams, verbs_list

    def generate_ngrams(self, tokens, n):
        return list(ngrams(tokens, n))


if __name__ == "__main__":
    a = 'How does the e-monies NEFT service differ from RGTS and EFT?'
    from StopWords import StopWords
    from StringProcessor import StringProcessor

    a = StringProcessor().normalize(a, 'en')
    en = StopWords.get_stop_words('en')
    cl = PhraseFinder()
    print(cl.find_phrases(a, en))
Example #26
 def __init__(self):
   
     self.dictionary = {}
     self.stopWords = StopWords("E:/New folder (2)/IR assi/stop words.txt")
Example #27
class Frequency():
    def __init__(self):
        self.collection = [['w1', 'w2', 'w4', 'w6'], ['w1', 'w2', 'w7', 'w3'],
                           ['w8', 'w5', 'w4', 'w5', 'w6']]
        self.dictionary = {}
        self.stopWords = StopWords(
            "D:/Information Retrieval/Assignment 2/stop words.txt")

    def loadDocuments(self):
        self.collection = []
        for i in range(1, 51):
            filename = "D:/Information Retrieval/Assignment 2/ShortStories/" + str(
                i) + ".txt"
            s = ""
            with open(filename) as f_obj:
                for line in f_obj:
                    if (line != '\n'):
                        l = re.sub('[^a-zA-Z0-9\s]|[\n]', '', line)
                        l = self.stopWords.removeWords(l.lower())
                        s = s + l.lower() + " "
#                        print(l.lower())
            lines = s.split(" ")
            self.collection.append(lines)

    def buildDictionary(self):
        for i in range(0, len(self.collection)):
            array = self.collection[i]

            for j in range(0, len(array)):
                if (array[j] not in self.dictionary):
                    docId = i + 1
                    d = {docId: 1}
                    self.dictionary[array[j]] = d
                else:
                    d = self.dictionary[array[j]]

                    if (i + 1) in d:
                        l = d[i + 1]
                        l = l + 1
                        d[i + 1] = l

                    else:
                        docId = i + 1
                        d[docId] = 1

                    self.dictionary[array[j]] = d

    def getTermFrequency(self, key):
        if key not in self.dictionary:
            return []
        return self.dictionary.get(key)

    def getDocumentFrequency(self, key):
        if key not in self.dictionary:
            return []
        return list(self.dictionary.get(key).keys())

    def getWords(self):
        return list(self.dictionary.keys())

    def getIdf(self, N):
        words = self.getWords()
        idf = [
            math.log10(N / len(self.getDocumentFrequency(x))) for x in words
        ]
        return idf
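
A minimal usage sketch, assuming the stop-word file referenced in __init__ exists:

freq = Frequency()
freq.buildDictionary()
print(freq.getTermFrequency('w1'))        # {1: 1, 2: 1}: docId -> count of 'w1'
print(freq.getDocumentFrequency('w1'))    # [1, 2]: documents containing 'w1'
print(freq.getIdf(len(freq.collection)))  # one idf value per dictionary term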