import unittest

# Assumes the project-local StopWords class, e.g. `from StopWords import StopWords`.

class TestStopWords(unittest.TestCase):
    def setUp(self):
        self.s = StopWords()

    def testIsStopWord(self):
        self.assertTrue(self.s.is_stop_word('a'))

    def testIsStopWord2(self):
        self.assertFalse(self.s.is_stop_word('rare word'))
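# A minimal, hypothetical runner for the tests above; it assumes the
# no-argument StopWords() constructor can locate its word list on its own.
if __name__ == '__main__':
    unittest.main()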
import os

class Partition:
    def __init__(self, punctuation, data_inter_path, data_path):
        self.punctuation = set(punctuation)
        self.num_words = 0
        self.sw = StopWords(data_path)
        os.chdir(data_inter_path)
        self.f = open('phrase_segments.txt', 'w')

    def split(self, sentence):
        # Replace in-sentence punctuation with commas, then split the
        # lowercased text into candidate phrase segments.
        new_sent = [None] * len(sentence)
        for i in range(len(sentence)):
            if sentence[i] in self.punctuation:
                new_sent[i] = ','
            else:
                new_sent[i] = sentence[i]
        mining_sentence = "".join(new_sent).lower().split(',')
        sentence = sentence.split(",")
        # Write each segment, minus stop words, to phrase_segments.txt.
        for seg in mining_sentence:
            seg = seg.split()
            new_set = []
            for word in seg:
                if not self.sw.isStopWord(word):
                    new_set.append(word)
            seg = " ".join(new_set).strip()
            if seg:
                self.f.write(seg + "\n")
        return sentence
class Partition:
    def __init__(self, punctuation):
        self.punctuation = set(punctuation)
        self.num_words = 0
        self.f = open('Intermediate/phrase_segments.txt', 'w')
        self.sw = StopWords()

    def split(self, sentence):
        new_sent = [None] * len(sentence)
        for i in range(len(sentence)):
            if sentence[i] in self.punctuation:
                new_sent[i] = ','
            else:
                new_sent[i] = sentence[i]
        mining_sentence = "".join(new_sent).lower().split(',')
        sentence = sentence.split(",")
        for seg in mining_sentence:
            seg = seg.split()
            new_set = []
            for word in seg:
                if not self.sw.isStopWord(word):
                    new_set.append(word)
            seg = " ".join(new_set).strip()
            if seg:
                self.f.write(seg + "\n")
        return sentence
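# A usage sketch for the Partition variant above. The sample sentence is an
# assumption, and the Intermediate/ directory must already exist for the
# output file to be created.
if __name__ == '__main__':
    p = Partition([';', ':', '&', '?', '/'])
    parts = p.split("the quick brown fox; a lazy dog: both slept")
    print(parts)  # the original sentence split on literal commas
    p.f.close()   # flush segments to Intermediate/phrase_segments.txt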
def __init__(self, path):
    data_home = os.path.split(path)[0]
    self.Documents = []
    self.allowed = set(
        [chr(i) for i in range(ord('a'), ord('z') + 1)] +
        [chr(i) for i in range(ord('A'), ord('Z') + 1)] +
        # [',', '-', ' '] + [str(i) for i in range(10)])
        [',', '.', '?', '-', '!', ' '] +
        [str(i) for i in range(10)])
    punctuation = [';', ':', '&', '?', "/"]
    # P = Partition(punctuation)
    self.tagger = PatternTagger()
    with open(path, 'r') as f:
        for line in f:
            # Input is tab-separated; keep the second column.
            li = line.split("\t")[1].strip()
            if li:
                self.Documents.append(li)
    data_Inter_path = os.path.join(data_home, "Intermediate")
    self.inter = data_Inter_path
    self.P = Partition(punctuation, data_Inter_path, data_home)
    self.sw = StopWords(data_home)
import re

class PositionalIndex:
    def __init__(self):
        self.collection = [['a', 'word', 'a', 'word', 'the'],
                           ['the', 'a', 'brown', 'cat', 'the', 'a'],
                           ['brown', 'cat', 'the', 'a', 'word']]
        self.dictionary = {}
        self.stopWords = StopWords(
            "D:/Information Retrieval/IR/stop words.txt")

    def loadDocuments(self):
        self.collection = []
        for i in range(1, 51):
            filename = "D:/Information Retrieval/IR/ShortStories/" + str(i) + ".txt"
            s = ""
            with open(filename) as f_obj:
                for line in f_obj:
                    if line != '\n':
                        # Strip everything but letters, digits, and whitespace.
                        l = re.sub(r'[^a-zA-Z0-9\s]|[\n]', '', line)
                        l = self.stopWords.removeWords(l)
                        s = s + l.lower() + " "
            lines = s.split(" ")
            self.collection.append(lines)

    def buildDictionary(self):
        # Map each term to {docId: [positions]}.
        for i in range(len(self.collection)):
            array = self.collection[i]
            for j in range(len(array)):
                docId = i + 1
                if array[j] not in self.dictionary:
                    self.dictionary[array[j]] = {docId: [j]}
                else:
                    d = self.dictionary[array[j]]
                    if docId in d:
                        d[docId].append(j)
                    else:
                        d[docId] = [j]

    def getPositionalIndex(self, key):
        if key not in self.dictionary:
            return []
        return self.dictionary.get(key)
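# Quick sanity check against the built-in sample collection (loadDocuments()
# is not called, but the stop-word file path in __init__ must still resolve).
if __name__ == '__main__':
    pi = PositionalIndex()
    pi.buildDictionary()
    print(pi.getPositionalIndex('brown'))  # {2: [2], 3: [0]}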
def term_to_id(self, term0):
    term = Preprocessing.convert_word_to_normal_form(term0)
    term = Preprocessing.lemmatize(term)
    # Keep only purely alphabetic terms (Latin or Cyrillic lowercase).
    if not re.match(r'[a-zа-я]+$', term):
        return None
    if self.excluds_stopwords and StopWords.is_stop_word(term):
        return None
    try:
        term_id = self.vocas_id[term]
    except KeyError:
        # First occurrence: assign the next id and register the term.
        term_id = len(self.vocas)
        self.vocas_id[term] = term_id
        self.vocas.append(term)
        self.docfreq.append(0)
    return term_id
import re

class InvertedIndex:
    def __init__(self):
        self.collection = [['a', 'word', 'a', 'word', 'the'],
                           ['the', 'a', 'brown', 'cat', 'the', 'a'],
                           ['brown', 'cat', 'the', 'a', 'word']]
        self.dictionary = {}
        self.stopWords = StopWords("D:/Information Retrieval/IR/stop words.txt")

    def loadDocuments(self):
        self.collection = []
        for i in range(1, 51):
            filename = "D:/Information Retrieval/IR/ShortStories/" + str(i) + ".txt"
            s = ""
            with open(filename) as f_obj:
                for line in f_obj:
                    if line != '\n':
                        l = re.sub(r'[^a-zA-Z0-9\s]|[\n]', '', line)
                        l = self.stopWords.removeWords(l)
                        s = s + l.lower() + " "
            lines = s.split(" ")
            self.collection.append(lines)

    def buildDictionary(self):
        # Map each term to the list of document ids containing it.
        for i in range(len(self.collection)):
            array = self.collection[i]
            for j in range(len(array)):
                if array[j] not in self.dictionary:
                    self.dictionary[array[j]] = [i + 1]
                else:
                    self.dictionary[array[j]].append(i + 1)
        # Deduplicate the postings lists.
        for key, value in self.dictionary.items():
            self.dictionary[key] = list(set(value))

    def getInvertedIndex(self, key):
        if key not in self.dictionary:
            return []
        return self.dictionary.get(key)
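# Usage sketch over the in-memory sample collection (the stop-word file
# path in __init__ must still resolve for construction to succeed).
if __name__ == '__main__':
    idx = InvertedIndex()
    idx.buildDictionary()
    print(idx.getInvertedIndex('the'))  # e.g. [1, 2, 3] (set order, may vary)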
def __init__(self, device='cpu', hyper_params=None):
    sup = super()
    sup.__init__(device=device, hyper_params=hyper_params)
    # One embedding table per configured key; the concatenated width sets
    # the hidden size used everywhere downstream.
    self.embeddings = nn.ModuleList([
        sup.get_embeddings(key=key, device=device)
        for key in self.hyper_params['embeddings']
    ])
    emb_dim = sum([item.embedding_dim for item in self.embeddings])
    self.hidden_size = emb_dim
    # Separate forward/backward GRUs form a two-layer bidirectional stack.
    self.f_gru1 = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
    self.b_gru1 = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
    self.f_gru2 = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
    self.b_gru2 = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
    self.num_head = hyper_params['num_head']
    self.attention = nn.ModuleList(
        [Attention(dimensions=emb_dim) for _ in range(self.num_head)])
    self.dropout = nn.Dropout(hyper_params['dropout_ratio'])
    self.pooling = nn.AdaptiveAvgPool1d(1)
    self.output = nn.Linear(emb_dim + 1, hyper_params['num_class'])
    self.to(device)
    with Path('../data/utils/cheatsheet.txt').open('r', encoding='utf-8-sig') as f:
        self.cheatsheet = set([line.strip() for line in f.readlines()])
    self.added_stop_words = StopWords(with_applied=True).get_instance()
    self.tokenizer = Tokenizer().get_instance()
def __init__(self, special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2,
                                   '<\\s>': 3, '<mask>': 4},
             with_del_stopwords=False, lower_count=0):
    if special_tokens is None:
        self.word2index = {'<unk>': 0, '<pad>': 1}
        self.current = 2
    else:
        self.word2index = special_tokens
        self.current = len(special_tokens)
    # Derive the reverse map and vocabulary from word2index so that the
    # special_tokens=None branch works too.
    self.index2word = {val: key for key, val in self.word2index.items()}
    self.vocab = set(self.word2index.keys())
    self.sentence2indexes, self.indexes2sentence = SentenceIndexer().get_instance()
    self.padding_index = self.word2index['<pad>']
    self.unknown_index = self.word2index['<unk>']
    self.delim = ' '
    self.counts = {}
    self.lower_count = lower_count
    self.max_length = 0
    self.stop_words = StopWords().get_instance()
    self.text_processor = Tokenizer().get_instance()
    self.with_del_stopwords = with_del_stopwords
# Assumes TextBlob and its PatternTagger, e.g.:
#   from textblob import TextBlob
#   from textblob.taggers import PatternTagger
class Clean:
    def __init__(self, path):
        self.Documents = []
        self.allowed = set(
            [chr(i) for i in range(ord('a'), ord('z') + 1)] +
            [chr(i) for i in range(ord('A'), ord('Z') + 1)] +
            # [',', '-', ' '] + [str(i) for i in range(10)])
            [',', '.', '?', '-', '!', ' '] +
            [str(i) for i in range(10)])
        self.punctuation = [';', ':', '&', '?', "/"]
        self.P = Partition(self.punctuation)
        self.tagger = PatternTagger()
        self.sw = StopWords()
        with open(path, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.Documents.append(line)

    def is_number(self, s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    def remove_stopwords(self, words, pos):
        new_sent = []
        new_pos = []
        for i in range(len(words)):
            if not self.sw.isStopWord(words[i]):
                new_sent.append(words[i])
                new_pos.append(pos[i])
        return new_sent, new_pos

    def replace_nums(self, s):
        sent = str(s)
        if sent[len(sent) - 1] == ".":
            sent = sent[0:len(sent) - 1]
        sent = sent.split()
        new_sent = []
        for word in sent:
            if self.is_number(word):
                pass  # drop numbers; an earlier variant appended "999999"
            else:
                new_sent.append(word)
        return " ".join(new_sent)

    def remove_things(self, string):
        string = string.replace("\t", " ")
        string = string.replace(" and ", ", and ")
        new_string = [char for char in string if char in self.allowed]
        return "".join(new_string)

    def clean_and_tag(self):
        with open('Intermediate/full_sentences.txt', 'w') as f, \
                open('Intermediate/full_pos.txt', 'w') as g, \
                open('Intermediate/sentences.txt', 'w') as m, \
                open('Intermediate/pos.txt', 'w') as n:
            for i in range(len(self.Documents)):
                if i % 10000 == 0 and i != 0:
                    print(str(i) + " documents processed.")
                doc = self.Documents[i]
                cleaned_doc = self.remove_things(doc)
                blob = TextBlob(cleaned_doc)
                for j in range(len(blob.sentences)):
                    sent = self.replace_nums(blob.sentences[j])
                    split_sentence = self.P.split(sent)
                    for k in range(len(split_sentence)):
                        frag = split_sentence[k]
                        sent_blob = TextBlob(frag, pos_tagger=self.tagger)
                        words, pos = [], []
                        for word, tag in sent_blob.pos_tags:
                            words.append(word)
                            pos.append(tag)
                        prefix = str(i) + ":" + str(j) + ":" + str(k) + ":"
                        f.write(prefix + " ".join(words) + "\n")
                        g.write(" ".join(pos) + "\n")
                        no_stop_words, no_stop_pos = self.remove_stopwords(words, pos)
                        m.write(prefix + " ".join(no_stop_words) + "\n")
                        n.write(" ".join(no_stop_pos) + "\n")
import re
import string

class TxtIndex:
    def __init__(self, fh):
        self.stop_words = StopWords()
        self.__fh = fh
        self.__reader = TxtReader(fh)
        self.build_index()

    def build_index(self):
        # Map each normalized, non-stop word to the file offsets where it starts.
        self.keyword2pointers = {}
        self.__reader.seek(0)
        while True:
            word = self.__reader.nextWord()
            if word is None:
                break
            word = self.norm_word(word)
            if self.stop_words.is_stop_word(word):
                continue
            if word not in self.keyword2pointers:
                self.keyword2pointers[word] = []
            self.keyword2pointers[word].append(self.__reader.startPointer())

    def norm_word(self, word):
        word = word.lower()
        word = word.rstrip(string.punctuation)
        return word

    def norm_phrase(self, phrase):
        phrase = re.sub(r'\s+', ' ', phrase)
        return ' '.join([self.norm_word(x) for x in phrase.split(' ')])

    def get_pointers(self, word):
        return self.keyword2pointers.get(self.norm_word(word))

    def exact_search(self, phrase):
        phrase = self.norm_phrase(phrase)
        words = phrase.split(' ')
        if len(words) == 1:
            return self.get_pointers(words[0])
        # Pick the rarest non-stop word as the anchor to minimize disk reads.
        try_word = None
        try_word_idx = None
        try_word_pointers = []
        for i in range(len(words)):
            word = words[i]
            if self.stop_words.is_stop_word(word):
                continue
            pointers = self.get_pointers(word)
            if pointers is None:
                return None
            if try_word is None or len(try_word_pointers) > len(pointers):
                try_word = word
                try_word_idx = i
                try_word_pointers = pointers
                extend_left_by = len(' '.join(words[0:i])) + len(words[0:i])
                extend_right_by = len(' '.join(words[i:])) + len(words[i:])
        # Verify the full phrase in a window around each anchor occurrence.
        phrase_re = re.compile(phrase.replace(' ', r'\s+'), re.I)
        found = []
        for pointer in try_word_pointers:
            s = pointer - extend_left_by
            l = extend_left_by + extend_right_by
            if s < 0:
                s = 0
            self.__fh.seek(s)
            excerpt = self.__fh.read(l)
            m = phrase_re.search(excerpt)
            if m:
                found.append(s + m.start())
        if len(found) > 0:
            return found
        else:
            return None

    def get_word_pointers(self, phrase):
        phrase = self.norm_phrase(phrase)
        words = phrase.split(' ')
        if len(words) == 1:
            return {words[0]: self.get_pointers(words[0])}
        word_pointers = {}
        for word in words:
            if word in word_pointers or self.stop_words.is_stop_word(word):
                continue
            word_pointers[word] = self.get_pointers(word)
        return word_pointers
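# A usage sketch under assumptions: TxtReader is the project's own reader
# (with seek/nextWord/startPointer) and 'corpus.txt' is a hypothetical file.
if __name__ == '__main__':
    with open('corpus.txt', 'r') as fh:
        index = TxtIndex(fh)
        print(index.get_pointers('retrieval'))              # byte offsets, or None
        print(index.exact_search('information retrieval'))  # match offsets, or None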
import csv
import random
import re

class DataManager:
    data = []
    trainingData = []
    testData = []
    stopWords = StopWords().list

    # Training data
    titles = []
    texts = []
    sentiments = []
    words = []
    countingWords = {}
    badWords = {}
    neutralWords = {}
    goodWords = {}

    # Test data
    phrases = []
    textsTest = []
    sentimentsTest = []

    def __init__(self):
        self.data = self.getData()
        self.separateData()
        self.organizeTrainingData()
        self.separateTrainingWords()
        self.separateTestPhrases()

    def getData(self):
        read = []
        with open('chennai.csv', 'r') as csvfile:
            readCSV = csv.reader(csvfile, delimiter=';')
            for row in readCSV:
                read.append(row)
        # random.shuffle(read)
        return read

    def separateData(self):
        # 80/20 split into training and test data.
        sizeTraining = int(0.8 * len(self.data))
        size = len(self.data)
        self.data.pop(0)  # drop the CSV header
        random.shuffle(self.data)
        self.trainingData = self.data[1:sizeTraining]
        self.testData = self.data[sizeTraining:size]

    def organizeTrainingData(self):
        for row in self.trainingData:
            self.titles.append(row[1])
            self.texts.append(row[2])
            self.sentiments.append(row[3])

    def addToArrayUnique(self, word, local):
        if word.lower() not in local and word.lower() not in DataManager.stopWords:
            finalWord = self.removeCharacters(word.lower())
            if finalWord != "":
                local.append(finalWord)

    def addToDictionary(self, words, local):
        for word in words:
            if word in local:
                local[word] = local[word] + 1
            else:
                local[word] = 1

    def separateTrainingWords(self):
        for i in range(len(self.texts)):
            wordsPerText = self.texts[i].split()
            for word in wordsPerText:
                self.addToArrayUnique(word, self.words)
            if self.sentiments[i] == '1':
                self.addToDictionary(self.words, self.badWords)
            elif self.sentiments[i] == '2':
                self.addToDictionary(self.words, self.neutralWords)
            elif self.sentiments[i] == '3':
                self.addToDictionary(self.words, self.goodWords)
            self.addToDictionary(self.words, self.countingWords)
            self.cleanData()

    def separateTestPhrases(self):
        a = 1
        dictionaryRow = {}
        dictionaryWords = {}
        for row in self.testData:
            if a == 2:
                dictionaryRow['titles'] = row[1]
                self.addToDictionary(self.separeteWords(row[2]), dictionaryWords)
                dictionaryRow['probabilityWords'] = dictionaryWords
                dictionaryRow['sentiments'] = row[3]
                self.phrases.append(dictionaryRow)
                dictionaryRow = {}
                dictionaryWords = {}
            else:
                a = 2  # skip the first test row

    def separeteWords(self, text):
        separatedWords = []
        wordsPerText = text.split()
        for word in wordsPerText:
            self.addToArrayUnique(word, separatedWords)
        return separatedWords

    def removeCharacters(self, word):
        # Unicode normalization would map each character to its Latin
        # equivalent:
        # nfkd = unicodedata.normalize('NFKD', word)
        # word = u''.join([c for c in nfkd if not unicodedata.combining(c)])
        # Use a regular expression to keep only letters, digits, and spaces.
        return re.sub(r'[^a-zA-Z0-9 \\]', '', word)

    def cleanData(self):
        self.words = []
def get_stopwords(self):
    lang = self.args.get('lang_code')
    return StopWords.get_stop_words(lang)
        # (Fragment: tail of the phrase-extraction method exercised below.)
        grams = self.generate_ngrams(tokens, 3)
        grams.extend(self.generate_ngrams(tokens, 2))
        for word in grams:
            if word not in stop_tokens:
                doc_grams.append(space_join(word))
        # Match verb groups (verb, optional adverbs, verbs) by POS pattern.
        pattern = r'<VERB>?<ADV>*<VERB>+'
        doc = textacy.Doc(sentence, lang=model)
        lists = textacy.extract.pos_regex_matches(doc, pattern)
        verbs_list = []
        for l in lists:
            verb_tokens = l.lemma_.split()
            for verb in verb_tokens:
                if verb not in stop_tokens and self.is_valid_word(verb):
                    verbs_list.append(verb)
        return doc_grams, unigrams, verbs_list

    def generate_ngrams(self, tokens, n):
        return list(ngrams(tokens, n))


if __name__ == "__main__":
    a = 'How does the e-monies NEFT service differ from RGTS and EFT?'
    from StopWords import StopWords
    from StringProcessor import StringProcessor
    a = StringProcessor().normalize(a, 'en')
    en = StopWords.get_stop_words('en')
    cl = PhraseFinder()
    print(cl.find_phrases(a, en))
def __init__(self):
    self.dictionary = {}
    self.stopWords = StopWords("E:/New folder (2)/IR assi/stop words.txt")
import math
import re

class Frequency:
    def __init__(self):
        self.collection = [['w1', 'w2', 'w4', 'w6'],
                           ['w1', 'w2', 'w7', 'w3'],
                           ['w8', 'w5', 'w4', 'w5', 'w6']]
        self.dictionary = {}
        self.stopWords = StopWords(
            "D:/Information Retrieval/Assignment 2/stop words.txt")

    def loadDocuments(self):
        self.collection = []
        for i in range(1, 51):
            filename = "D:/Information Retrieval/Assignment 2/ShortStories/" + str(i) + ".txt"
            s = ""
            with open(filename) as f_obj:
                for line in f_obj:
                    if line != '\n':
                        l = re.sub(r'[^a-zA-Z0-9\s]|[\n]', '', line)
                        l = self.stopWords.removeWords(l.lower())
                        s = s + l.lower() + " "
            lines = s.split(" ")
            self.collection.append(lines)

    def buildDictionary(self):
        # Map each term to {docId: occurrence count}.
        for i in range(len(self.collection)):
            array = self.collection[i]
            for j in range(len(array)):
                docId = i + 1
                if array[j] not in self.dictionary:
                    self.dictionary[array[j]] = {docId: 1}
                else:
                    d = self.dictionary[array[j]]
                    if docId in d:
                        d[docId] = d[docId] + 1
                    else:
                        d[docId] = 1

    def getTermFrequency(self, key):
        if key not in self.dictionary:
            return []
        return self.dictionary.get(key)

    def getDocumentFrequency(self, key):
        if key not in self.dictionary:
            return []
        return list(self.dictionary.get(key).keys())

    def getWords(self):
        return list(self.dictionary.keys())

    def getIdf(self, N):
        # idf(t) = log10(N / df(t)) for every indexed term.
        words = self.getWords()
        idf = [math.log10(N / len(self.getDocumentFrequency(x))) for x in words]
        return idf
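# Usage sketch over the built-in sample collection (N = 3 documents); the
# stop-word file path in __init__ must still resolve.
if __name__ == '__main__':
    fr = Frequency()
    fr.buildDictionary()
    print(fr.getTermFrequency('w4'))      # {1: 1, 3: 1}
    print(fr.getDocumentFrequency('w4'))  # [1, 3]
    print(fr.getIdf(3))                   # one idf value per indexed term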