class BagOfWordsFeatureBooleanizer(FeatureBooleanizer):

    def __init__(self, featureName, featuresData, featureId):
        FeatureBooleanizer.__init__(self, featureName, featuresData, featureId)
        self.stemmer = Stemmer('english')
        self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
        stopListFn = './resources/general/stopword.csv'
        self.stopList = frozenset(l for l in filter(None, map(lambda l: self.preprocess(l), open(stopListFn, 'rt').readlines())))
        allWords = set()
        if self.featureName == 'Basic: Tagline':
            for row in featuresData:
                allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split(','))))
        else:
            for row in featuresData:
                allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split())))
        self.words = sorted(list(filter(None, allWords - self.stopList)))

    def preprocess(self, s):
        chars = []
        for c in unidecode(s.strip().lower()):
            if c in self.goodChars:
                chars.append(c)
        word = ''.join(chars)
        return self.stemmer.stemWord(word)

    def getFeatureNames(self):
        return [self.featureName + ': ' + word for word in self.words]

    def process(self, v):
        vWords = set(map(lambda w: self.preprocess(w), filter(None, v.split(','))))
        return [(word in vWords) for word in self.words]
def stemmer(tokens):
    # ps = PorterStemmer()
    # tokens = [ps.stem(w) for w in tokens]
    ps = Stemmer('porter')
    tokens = [ps.stemWord(w) for w in tokens]
    return tokens
def run():
    stemmer = Stemmer("english")
    pages = db.en.find()
    print colored.yellow("statistic words")
    wordstatistic = {}
    for page in progress.bar(pages, size=db.en.count()):
        data = page.get("data")
        if not data:
            continue
        content = data.get("content")
        if not content:
            db.en.remove({"_id": page["_id"]})
            continue
        words = EN_WORD_CUT.split(content)
        for word in words:
            w = stemmer.stemWord(word.strip()).lower()
            if w and len(w) < 20 and w not in EN_IGNORE:
                if wordstatistic.get(w):
                    wordstatistic[w] += 1
                else:
                    wordstatistic[w] = 1
    print colored.yellow("save to en_words_freq")
    savequene = []
    for k, v in progress.bar(wordstatistic.iteritems(), size=len(wordstatistic)):
        savequene.append({"_id": k, "freq": v})
        if len(savequene) >= 1000:
            db.en_words_freq.insert(savequene)
            savequene = []
    if savequene:
        db.en_words_freq.insert(savequene)
    print colored.cyan("count of en_words_freq: %d" % db.en_words_freq.count())
class StemProvider(Provider):
    """Stem the input values (either a single word or a list of words).

    Uses the Porter stemmer algorithm.
    """

    def __init__(self, language='english', **kwargs):
        """
        See here for a full list of languages:

            http://nltk.org/_modules/nltk/stem/snowball.html

        .. note:: This does not depend on nltk; it depends on the
           ``pystemmer`` package.

        :param language: language to use during stemming, defaults to english.
        """
        Provider.__init__(self, **kwargs)
        self._stemmer = Stemmer(language)

    def do_process(self, input_value):
        if isinstance(input_value, str):
            return self._stemmer.stemWord(input_value)
        else:
            return self._stemmer.stemWords(input_value)
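A minimal usage sketch for the provider above; it assumes the Provider base class needs no required constructor arguments and that PyStemmer is installed (from Stemmer import Stemmer):

# Hypothetical usage; Provider base class assumed to accept no required args.
provider = StemProvider()
print(provider.do_process('running'))             # -> 'run'
print(provider.do_process(['running', 'flies']))  # -> ['run', 'fli']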
class BagOfWordsFeatureSupport(FeatureSupport):

    def __init__(self, featuresData, featureId):
        FeatureSupport.__init__(self, featuresData, featureId)
        self.stemmer = Stemmer('english')
        self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
        stopListFn = './resources/general/stopword.csv'
        self.stopList = frozenset(l for l in filter(None, map(lambda l: self.preprocess(l), open(stopListFn, 'rt').readlines())))

    def preprocess(self, s):
        chars = []
        for c in unidecode(s.strip().lower()):
            if c in self.goodChars:
                chars.append(c)
        word = ''.join(chars)
        return self.stemmer.stemWord(word)

    def extract(self, i):
        bag = frozenset(map(lambda w: self.preprocess(w), filter(None, self[i].split())))
        ret = bag - self.stopList
        if len(ret) == 0:
            # Fall back to a random 20-character token so the bag is never empty.
            ret = frozenset([''.join(random.choice('abcdefghjiklmnopqrstuvwxyz') for _ in range(20))])
        return ret

    def similarity(self, a, b):
        # Jaccard similarity; note this assumes Python 3 true division.
        num = len(a & b)
        den = len(a | b)
        return num / den if den != 0 else 1.0
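To make the similarity() arithmetic concrete, a small worked example with two hand-built bags (the values are illustrative, not taken from the dataset):

a = frozenset(['run', 'fast', 'dog'])
b = frozenset(['run', 'slow', 'dog'])
# |a & b| = 2 ('run', 'dog'); |a | b| = 4, so Jaccard = 2 / 4
print(len(a & b) / len(a | b))  # 0.5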
def getStems(cleanedText, stopWords):
    stems = {}
    matches = re.finditer(r'\w+(\.?\w+)*', cleanedText.strip(), flags=re.IGNORECASE)
    stemmer = Stemmer('english')
    #maxlength = sum(1 for _ in matches1)
    #stemmer.maxCacheSize = maxlength
    offset = len(termDict)
    tokenid = offset + 1
    position = 0
    for match in matches:
        #position = match.start()
        position += 1
        token = match.group()
        filteredToken = filterToken(token, stopWords)
        if filteredToken:
            wordStem = stemmer.stemWord(filteredToken.lower())
            if wordStem not in stems:
                stems[wordStem] = tokenid
                positions = set()
                positions.add(position)
                if wordStem not in termDict:
                    termDict[wordStem] = tokenid
                    terms[tokenid] = positions
                    tokenid = tokenid + 1
                else:
                    stemid = termDict[wordStem]
                    terms[stemid] = positions
            else:
                stemid = termDict[wordStem]
                postns = terms[stemid]
                postns.add(position)
                terms[stemid] = postns
def tokenise(value, identifier, category, content_stop):
    token_list = []
    final_list = []
    value = re.sub(exclude1, " ", value)
    value = re.sub(exclude2, " ", value)
    value = re.sub(r'[^a-zA-Z]', " ", value)
    value = value.lower()
    if category == 'e':
        value = re.sub(r'(http|www|com)', " ", value)
    if category == 'c':
        value = re.sub(r'category', " ", value)
    token_list = value.split()
    for w in token_list:
        if w not in content_stop:
            final_list.append(w)
    # stemmer = PorterStemmer()
    stemmer = Stemmer("english")
    final_list = [stemmer.stemWord(key) for key in final_list]
    # final_list = [stemmer.stem(plural, 0, len(plural) - 1) for plural in final_list]
    if final_list:
        # call next function here.
        return final_list
    #### after work of token_list is done ####
    token_list = []
    final_list = []
def textHandler(text):
    # tokenizing: drop non-ASCII, then non-alphanumerics
    text = text.encode('ascii', errors='ignore').decode()
    text = re.sub(r'[^A-Za-z0-9]+', r' ', text)
    # stop word removal
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filter_sentence = [w for w in word_tokens if w not in stop_words]
    # stemming
    stemmer = Stemmer('porter')
    stem_text = []
    for word in filter_sentence:
        stem_text.append(stemmer.stemWord(word))
    return stem_text
def stem(datalist):
    # Stemming
    stemmer = Stemmer("english")
    tmp = []
    for x in datalist:
        y = stemmer.stemWord(x)
        tmp.append(y)
    return tmp
class Stemmer(object):
    def __init__(self):
        # type: () -> None
        self.stemmer = PyStemmer('porter')

    def stem(self, word):
        # type: (unicode) -> unicode
        return self.stemmer.stemWord(word)
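The wrapper above only works with an aliased import; a sketch of the assumed setup (PyStemmer exposes a Stemmer class inside the Stemmer module, aliased here to avoid clashing with the wrapper's own name):

from Stemmer import Stemmer as PyStemmer  # assumption: PyStemmer package installed

stemmer = Stemmer()
print(stemmer.stem('running'))  # -> 'run' under the Porter algorithm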
def apply_snowball_stemmer(tagged_sentences: List[List[Tuple]],
                           stemmer: Stemmer) -> List[List[Tuple]]:
    stemmed_sentences = []
    for sentence in tagged_sentences:
        stemmed = []
        for pos, text_repr, surface_repr, tag in sentence:
            word = surface_repr.lower() if tag in [CONTENT_WORD_TAG, STOPWORD_TAG] else surface_repr
            stemmed.append((pos, text_repr, surface_repr, stemmer.stemWord(word), tag))
        stemmed_sentences.append(stemmed)
    return stemmed_sentences
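A sketch of the expected 4-tuple input and 5-tuple output. The tag constants are assumptions (here CONTENT_WORD_TAG is taken to be 'CW'; the real values are not shown in the snippet):

# Hypothetical tags and input; stemmer is a PyStemmer instance.
sentences = [[(0, 'Running', 'Running', 'CW'), (1, '.', '.', 'PUNCT')]]
stemmed = apply_snowball_stemmer(sentences, Stemmer('english'))
# -> [[(0, 'Running', 'Running', 'run', 'CW'), (1, '.', '.', '.', 'PUNCT')]]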
def make_index(expression):
    """
    Standardize the expression and return a tuple that maximises
    matching possibilities.

    expression must be a list or tuple.
    """
    stemmer = Stemmer("french")
    expression = [stemmer.stemWord(normalize_token(w)) for w in expression]
    expression.sort()
    return tuple(expression)
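A hedged usage sketch; normalize_token is not shown in the snippet and is assumed to lower-case and strip accents, and the exact French stems are illustrative:

make_index(['Pommes', 'vertes'])  # e.g. ('pomm', 'vert'); sorted, so order-insensitive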
def processQueries(queries):
    queryList = []
    stemmer = Stemmer('english')  # one instance suffices; hoisted out of the loop
    for query in queries:
        filteredQuery = tokenize.filterToken(query, tokenize.getStopWords())
        if filteredQuery:
            queryStem = stemmer.stemWord(filteredQuery.lower())
            queryList.append(queryStem)
    return queryList
def parse_html(html):
    words = dehtml(html)
    s = Stemmer("danish")
    result = []
    for w in words.split():
        word = w.lower()
        if word in stop_words or len(word) < 2 or word.count('\\'):
            continue
        result.append(s.stemWord(word))
    return result
def getTerm(term):
    term_ids = {}
    stemmer = Stemmer('english')
    #stemmer.maxCacheSize = 1
    termStem = stemmer.stemWord(term.lower())
    # with-block replaces the original unreachable close() after return
    with open(TERMIDSFILE, 'rU') as term_ids_file:
        for line in term_ids_file:
            pieces = line.strip().split('\t')
            if termStem == pieces[1]:
                term_ids[pieces[1]] = int(pieces[0])
                return term_ids
    return term_ids
def cleanQuery(data):
    global StopWords, Stemmer, extension
    data = data.lower()
    data = re.sub(r'<(.*?)>', '', data)  # Remove HTML tags
    data = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '', data, flags=re.MULTILINE)  # Remove URLs
    data = re.sub('[^A-Za-z0-9]+', ' ', data)  # Remove special characters
    token_list = word_tokenize(data)  # Tokenize string
    token_list = [
        word for word in token_list
        if word not in StopWords and word not in extension
    ]  # Remove stopwords and extended stopwords
    tokens_list = [Stemmer.stemWord(word) for word in token_list]
    return tokens_list
class TextEater(object):
    def __init__(self):
        self.stoplist = gen_stops()
        self.stemmer = Stemmer('english')

    @coroutine
    def sent_filter(self, target):
        word = ''
        print "ready to eat lines"
        while True:
            sentence = (yield)
            target.send((sentence.lower()).split())

    @coroutine
    def word_filter(self, target):
        print "ready to eat words"
        while True:
            raw = (yield)
            # Note: this keeps only words of length <= 3 or words on the
            # stoplist; the opposite filter may have been intended.
            target.send([self.stemmer.stemWord(w) for w in raw
                         if len(w) <= 3 or w in self.stoplist])

    @coroutine
    def ngrams(self, container, n=2):
        "Compute n-grams"
        while True:
            grams = (yield)
            for i in range(0, len(grams) - (n - 1)):
                container[tuple(grams[i:i + n])] += 1

    @coroutine
    def printer(self):
        while True:
            line = (yield)
            print(line)

    @coroutine
    def typer(self, target):
        print "ready to check type"
        word = None
        while True:
            line = (yield word)
            word = type(line)
def cleanData(data):
    global StopWords, Stemmer, total_count
    data = re.sub(r'<(.*?)>', '', data)  # Remove HTML tags
    data = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '', data, flags=re.MULTILINE)  # Remove URLs
    data = re.sub('[^A-Za-z0-9]+', ' ', data)  # Remove punctuation and special characters
    token_list = word_tokenize(data)  # Tokenize string
    total_count += len(token_list)
    token_list = [
        word for word in token_list
        if word not in StopWords and word not in extension
    ]  # Remove stopwords and extended stopwords
    token_list = [Stemmer.stemWord(word) for word in token_list]  # Stem words
    return token_list
def index(text, accepted_languages=None, langs=None):
    registry = get_current_registry()
    if accepted_languages is None:
        accepted_languages = [
            x.strip()
            for x in registry.settings["accepted_languages"].split(",")
        ]
    if langs is None:
        lang = guessLanguage(text)
        if lang not in accepted_languages:
            langs = accepted_languages
        else:
            langs = [lang]
    langs = list(set(langs).intersection(set(accepted_languages)))
    if not langs:
        langs = accepted_languages
    indexed_words = set()
    for lang in langs:
        stemmer = Stemmer(lang)
        indexed_words.update(stemmer.stemWord(x.value) for x in tokenize(text))
    return indexed_words
class Overview(Feature):
    description = """
    Basic: Overview
    """.strip()

    def __init__(self, *args, **kwargs):
        Feature.__init__(self)
        self.stemmer = Stemmer('english')
        self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
        # Pre-stemmed stop list (entries are already Porter stems).
        self.stopList = frozenset([
            'a', 'abaft', 'aboard', 'about', 'abov', 'absent', 'accord', 'account', 'across', 'addit', 'afor', 'after',
            'against', 'ago', 'ahead', 'all', 'along', 'alongsid', 'alreadi', 'also', 'am', 'amid', 'amidst', 'among',
            'amongst', 'an', 'and', 'anenst', 'ani', 'anoth', 'anybodi', 'anyhow', 'anyon', 'anyth', 'anywher', 'apart',
            'apr', 'april', 'apropo', 'apud', 'are', 'around', 'as', 'asid', 'astrid', 'at', 'athwart', 'atop',
            'aug', 'august', 'back', 'bad', 'bar', 'be', 'becaus', 'been', 'befor', 'begin', 'behalf', 'behest',
            'behind', 'below', 'beneath', 'besid', 'best', 'better', 'between', 'beyond', 'big', 'bigger', 'biggest', 'billion',
            'blah', 'bln', 'both', 'but', 'by', 'c', 'ca', 'call', 'can', 'cannot', 'cant', 'case',
            'circa', 'close', 'concern', 'could', 'couldt', 'current', 'daili', 'day', 'dec', 'decemb', 'despit', 'did',
            'do', 'doe', 'doesnt', 'done', 'dont', 'down', 'due', 'dure', 'each', 'eight', 'eighteen', 'eighth',
            'eighti', 'eleven', 'end', 'enough', 'ever', 'except', 'exclud', 'fail', 'far', 'feb', 'februari', 'few',
            'fifth', 'first', 'five', 'fiveteen', 'fivti', 'follow', 'for', 'forenenst', 'four', 'fourteen', 'fourth', 'fourti',
            'fri', 'friday', 'from', 'front', 'full', 'further', 'get', 'given', 'go', 'gone', 'goot', 'had',
            'hadnt', 'has', 'hasnt', 'have', 'havent', 'he', 'her', 'here', 'herself', 'high', 'higher', 'hightst',
            'himself', 'his', 'how', 'hunderd', 'i', 'if', 'in', 'includ', 'insid', 'instead', 'into', 'is',
            'it', 'itself', 'jan', 'januari', 'jul', 'juli', 'jun', 'june', 'just', 'last', 'late', 'later',
            'latest', 'left', 'lest', 'lieu', 'like', 'littl', 'long', 'low', 'lower', 'lowest', 'made', 'make',
            'mani', 'mar', 'march', 'may', 'me', 'mean', 'mid', 'midst', 'might', 'milliard', 'million', 'mine',
            'minus', 'mld', 'mln', 'modulo', 'mon', 'monday', 'month', 'more', 'most', 'mth', 'much', 'must',
            'my', 'myself', 'near', 'need', 'neednt', 'neither', 'never', 'next', 'nine', 'nineteen', 'nineth', 'nineti',
            'no', 'none', 'nor', 'not', 'notwithstand', 'nov', 'novemb', 'number', 'o', 'oct', 'octob', 'of',
            'off', 'on', 'one', 'onli', 'onto', 'oppos', 'opposit', 'or', 'order', 'other', 'ought', 'our',
            'ourselv', 'out', 'outsid', 'over', 'owe', 'pace', 'past', 'per', 'place', 'plus', 'point', 'previous',
            'prior', 'pro', 'pursuant', 'put', 'qua', 'rather', 'recent', 'regard', 'regardless', 'respect', 'right', 'round',
            'said', 'sake', 'same', 'san', 'sat', 'saturday', 'save', 'saw', 'say', 'second', 'see', 'seen',
            'sep', 'septemb', 'seven', 'seventeen', 'seventh', 'seventi', 'sever', 'shall', 'she', 'should', 'shouldnt', 'show',
            'shown', 'sinc', 'six', 'sixteen', 'sixth', 'sixti', 'small', 'smaller', 'smallest', 'so', 'some', 'somebodi',
            'somehow', 'someon', 'someth', 'somewher', 'soon', 'sooner', 'spite', 'start', 'still', 'subsequ', 'such', 'sun',
            'sunday', 'take', 'taken', 'tell', 'ten', 'tenth', 'than', 'thank', 'that', 'the', 'their', 'them',
            'themselv', 'there', 'these', 'they', 'third', 'thirteen', 'thirti', 'this', 'those', 'thousand', 'three', 'through',
            'throughout', 'thru', 'thruout', 'thu', 'thursday', 'till', 'time', 'to', 'today', 'told', 'too', 'took',
            'top', 'toward', 'tue', 'tuesday', 'twelv', 'twenti', 'two', 'under', 'underneath', 'unit', 'unlik', 'until',
            'unto', 'up', 'upon', 'us', 'use', 'versus', 'via', 'vice', 'view', 'virtu', 'vis', 'visavi',
            'vs', 'was', 'we', 'wed', 'wednesday', 'week', 'well', 'went', 'were', 'what', 'when', 'where',
            'whether', 'whi', 'which', 'while', 'who', 'whose', 'will', 'with', 'within', 'without', 'wont', 'wors',
            'worst', 'worth', 'would', 'wrt', 'xor', 'year', 'yes', 'yesterday', 'yet', 'you', 'your', 'yourself',
            'yourselv', 'yr'])

    def preprocess(self, s):
        chars = []
        for c in unidecode(s.strip().lower()):
            if c in self.goodChars:
                chars.append(c)
        word = ''.join(chars)
        return self.stemmer.stemWord(word)

    def extract(self, m):
        t = m.overview
        return ','.join(sorted(list(set(filter(
            lambda w: len(w) > 0 and w not in self.stopList,
            map(self.preprocess, t.split()))))))
def nonField_query(path, text, secondary_index_list):
    text = text.lower()
    text = text.encode('ascii', errors='ignore').decode()
    text = re.sub(r'[^A-Za-z0-9]+', r' ', text)
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filter_sentence = [w for w in word_tokens if w not in stop_words]
    stemmer = Stemmer('porter')
    stem_text = []
    for word in filter_sentence:
        stem_text.append(stemmer.stemWord(word))
    result_list = []
    for word in stem_text:
        result_list.append(Posting(secondary_index_list, word, path))
    return result_list
class Stemmer(object):
    def __init__(self):
        self.stemmer = PyStemmer('porter')

    def stem(self, word):
        return self.stemmer.stemWord(word)
        os.remove(pathOfFolder + f_name + str(i))
    else:
        listOfWords[i] = topOfFile[i].split(':')
        if listOfWords[i][0] not in heap:
            heapq.heappush(heap, listOfWords[i][0])

writeIntoFile(tag_index, pathOfFolder, data, countFinalFile)

#############################################################################
if os.path.exists(os.path.join(absltPthCurrPrgrm, 'stopwords.txt')):
    with open(os.path.join(absltPthCurrPrgrm, 'stopwords.txt'), 'r') as file:
        words = file.read().split('\n')
        # stem the stop words
        for word in words:
            word = ps.stemWord(word)
            if word:
                stopwords[word] = 1
else:
    print("stopwords.txt does not exist in the directory")
    sys.exit()

documentcount = 0

###########################################################################
# parse the documents
for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
    tag_name = strip_tag_name(elem.tag)
    # finished extracting all the text in the page tag.
    if (tag_name == 'page') and (event == 'end'):
        documentcount += 1
class StemCorpus(Corpus):
    def __init__(self):
        super().__init__()
        self.stemmer = Stemmer('russian')

    def __getstate__(self):
        return self.word_to_idx, self.idx_to_word

    def __setstate__(self, state):
        self.stemmer = Stemmer('russian')
        self.word_to_idx, self.idx_to_word = state

    def encode_word(self, word):
        stem_form = self.stemmer.stemWord(word.lower())
        return self.word_to_idx.get(stem_form, len(self.idx_to_word) - 1)

    def build(self, sentences, vocabulary_size=50000, log_every=100000):
        print('= Start building vocabulary')
        vocab = defaultdict(int)
        saved_sentences = []
        for i, s in enumerate(sentences, 1):
            line = s.lower().split()
            for tok in line:
                if tok in PUNKT_TAGS:
                    continue
                stem_form = self.stemmer.stemWord(tok.lower())
                vocab[stem_form] += 1
            if i % log_every == 0:
                print('--- Processed {} sentences'.format(i))
            saved_sentences.append(line)
        print('= Built vocabulary with size {}'.format(len(vocab)))
        if vocabulary_size < len(vocab):
            print('= Trim it to {}'.format(vocabulary_size))
        word_freq = list(
            map(itemgetter(0),
                sorted(vocab.items(), key=_freq_sorter, reverse=True)))
        word_freq = word_freq[:vocabulary_size]
        print('Top 10 most frequent words: {}'.format(', '.join(word_freq[:10])))
        print('Top 10 least frequent words: {}'.format(', '.join(word_freq[-10:])))
        print('= Building word to index mapping')
        if Tag.NUM not in word_freq:
            word_freq[-2] = Tag.NUM
        if Tag.ENG not in word_freq:
            word_freq[-1] = Tag.ENG
        assert Tag.EOS not in word_freq
        word_freq.append(Tag.EOS)
        assert Tag.UNK not in word_freq
        word_freq.append(Tag.UNK)
        self.idx_to_word.clear()
        self.word_to_idx.clear()
        for w in word_freq:
            self.word_to_idx[w] = len(self.idx_to_word)
            self.idx_to_word.append(w)
        print('= Built mappings')
        print('idx_to_word size = {}, word_to_idx size = {}'.format(
            len(self.idx_to_word), len(self.word_to_idx)))
class Searcher:
    def __init__(self):
        self.lexicon = {}     # lexicon for assisting in search
        self.titles = {}      # document titles
        self.stop_words = {}
        self.stemmer = Stemmer("english")  # for stemming of words
        # total count of all pages found in our document
        # (please update this count according to your dataset)
        self.totalDocs = 127467
        self.load()  # loading all the files and writing them to respective dictionaries

    def load(self):
        self.loadLexicon("Lexicon.txt")
        self.loadTitles("titles.txt")
        self.loadStopWords("Stop_words.txt")

    def loadLexicon(self, path):
        try:
            lexiconFile = open(path, 'r')
            for line in lexiconFile:
                x = line[:-1].split("-")
                self.lexicon[int(x[0])] = int(x[1])
        except:
            print("Error opening lexicon file")
            sys.exit(0)

    def loadTitles(self, path):
        try:
            titleFile = open(path, 'r', encoding="UTF-8")
            for line in titleFile:
                x = line[:-1].split("-")
                self.titles[int(x[0])] = x[1]
        except:
            print("Error opening titles file")
            sys.exit(0)

    def loadStopWords(self, path):
        try:
            stop_words_file = open(path, 'r')
            content = stop_words_file.read()
            content = re.split(",", content)
            for word in content:
                if word:
                    self.stop_words[word] = True
        except:
            print("Error opening stop words file")
            sys.exit(0)

    # method to intersect lists
    def intersectLists(self, lists):
        if len(lists) == 0:
            return []
        # start intersecting from the smaller list
        lists.sort(key=len)
        c = lists[0]
        for x in lists[1:]:
            c = list(set(c) & set(x))
        return c

    # method to get document titles for document ids
    def getDocTitles(self, docIds):
        docTitles = []
        for y in docIds:
            title = self.titles.get(y)
            if title is not None:
                docTitles.append(title)
        return docTitles

    # method to process and organize raw hitlist from index
    def processRawHitlist(self, hitlists):
        parentArr = []        # master array to contain all categories of hits
        parentArr.append({})  # title hits dictionary
        parentArr.append({})  # subTitle hits dictionary
        parentArr.append({})  # category hits dictionary
        parentArr.append({})  # text hits dictionary
        # splitting the hitlist and recording hits in parentArr
        hitlists = hitlists[:-1].split("|")[1].split("/")
        for singleList in hitlists:
            singleDocumentList = singleList.split("-")
            docId = int(singleDocumentList[0])
            for smallerLists in singleDocumentList[1].split(","):
                a = smallerLists.split(".")
                type = int(a[0])  # the category or type of hit (title, text, etc.)
                pos = int(a[1])
                if docId not in parentArr[type]:
                    parentArr[type][docId] = [pos]
                else:
                    parentArr[type][docId].append(pos)
        return parentArr

    # method to return final processed hitlist of a word
    def getHitlist(self, word):
        word = word.lower()
        wordId = zlib.crc32(word.encode("UTF-8"))  # getting word id
        bWordId = wordId.to_bytes(4, byteorder="big", signed=False)
        self.word_file = bWordId[0]       # getting the file containing the word
        off = self.lexicon.get(wordId)    # getting word pointer in index
        if off is not None:
            f = open("SortedIndex/" + str(bWordId[0]) + ".txt", "r")
            f.seek(off)
            y = f.readline()  # reading raw hitlist
            f.close()
            return self.processRawHitlist(y)
        else:
            return []

    # method to return all docs that contain the given words without catering for proximity
    def getUnproximatedDocs(self, wordsList, type):
        docIds = []
        for arr in wordsList:
            if arr != []:
                arr = arr[type]
                docIds.append(arr.keys())
        docs = self.intersectLists(docIds)
        return docs

    # method to return phrase results in a particular type
    def getResultsForPhrase(self, wordsList, type):
        termDocsCount = 0
        docs = {}
        unproximatedDocs = self.getUnproximatedDocs(wordsList, type)
        # converting unproximated docs to proximated
        for docId in unproximatedDocs:
            proximityArr = []
            for i, arr in enumerate(wordsList):
                if arr != []:
                    arr = arr[type]  # getting hitlist for a particular type of hits
                    poss = arr.get(docId)
                    # subtracting n from positions of a word in document
                    # to bring them on a common line
                    proximityArr.append([pos - i for pos in poss])
            # intersecting positions to find phrases in documents
            t = self.intersectLists(proximityArr)
            if len(t) > 0:
                tf = len(t)       # term frequency is the length of the intersection result
                docs[docId] = tf  # recording term frequency of each document
                termDocsCount += 1
        if type == 3:
            return self.rankDocs(docs, termDocsCount)  # ranking the results
        else:
            return docs.keys()

    def rankDocs(self, docs, termDocsCount):
        y = docs
        # calculating the inverse document frequency
        if termDocsCount != 0:
            x = self.totalDocs / termDocsCount
            ifd = math.log2(x)
            # calculating tf-idf score for each document
            for x in docs.keys():
                docs[x] = docs[x] * ifd
            # returning a sorted array based on tf-idf values of documents
            x = sorted(docs.items(), key=lambda kv: kv[1], reverse=True)
            y = [a[0] for a in x]
        return y

    # append two arrays
    def appendResults(self, results, moreResults):
        for x in moreResults:
            if x not in results:
                results.append(x)
        return results

    # get results for a single word query
    def getResultsForWord(self, wordHitlist, type):
        docs = {}
        typeArr = wordHitlist[type]  # getting hitlists for a particular type of hit
        for doc in typeArr:
            docs[doc] = len(typeArr.get(doc))  # recording tf of documents relative to the query term
        termDocsCount = len(typeArr)
        if type == 3 and termDocsCount != 0:
            # ranking titles, categories and subtitles has no significant benefit
            x = self.totalDocs / termDocsCount
            ifd = math.log2(x)
            for x in docs.keys():
                docs[x] = docs[x] * ifd  # finding tf-idf scores
            x = sorted(docs.items(), key=lambda kv: kv[1], reverse=True)  # sorting by tf-idf scores
            y = [a[0] for a in x]
            return y
        else:
            return [x for x in docs.keys()]

    # method to do a single word query on words of a phrase query
    # and return results with a certain order
    def getMoreResults(self, singleWordResults, count):
        docs = []
        for type in range(4):
            a = []
            maxCount = -1
            for i in range(len(singleWordResults)):
                v = singleWordResults[i].get(type)
                if v is None:
                    continue
                if len(v) > maxCount:
                    maxCount = len(v)
                a.append(v)
            for j in range(maxCount):
                for i in range(len(a)):
                    if j < len(a[i]):
                        docs.append(a[i][j])
                        if len(docs) == count:
                            return docs

    # method to do a one word query
    def oneWordQuery(self, word, mode):
        hitlist = self.getHitlist(word)
        if hitlist == []:
            return {}
        else:
            if mode:
                # a single word query on terms of a phrase query;
                # dictionary results preserve the per-type order
                results = {}
                titleDoc = self.getResultsForWord(hitlist, 0)
                results[0] = titleDoc      # results for title hits
                subTitleDoc = self.getResultsForWord(hitlist, 1)
                results[1] = subTitleDoc   # results for sub title hits
                categoryDoc = self.getResultsForWord(hitlist, 2)
                results[2] = categoryDoc   # results for category hits
                textDoc = self.getResultsForWord(hitlist, 3)
                results[3] = textDoc       # results for text hits
                return results
            else:
                results = []  # array results for a query on an atomic single word
                titleDoc = self.getResultsForWord(hitlist, 0)
                results = self.appendResults(results, titleDoc)
                subTitleDoc = self.getResultsForWord(hitlist, 1)
                results = self.appendResults(results, subTitleDoc)
                categoryDoc = self.getResultsForWord(hitlist, 2)
                results = self.appendResults(results, categoryDoc)
                textDoc = self.getResultsForWord(hitlist, 3)
                results = self.appendResults(results, textDoc)
                return results

    # method to do a phrase query
    def phraseQuery(self, words):
        results = []
        wordsList = []
        for word in words:
            wordsList.append(self.getHitlist(word))
        titleDoc = self.getResultsForPhrase(wordsList, 0)     # title hit results
        results = self.appendResults(results, titleDoc)
        subTitleDoc = self.getResultsForPhrase(wordsList, 1)  # subtitle hit results
        results = self.appendResults(results, subTitleDoc)
        categoryDoc = self.getResultsForPhrase(wordsList, 2)  # category hit results
        results = self.appendResults(results, categoryDoc)
        textDoc = self.getResultsForPhrase(wordsList, 3)      # text hit results
        results = self.appendResults(results, textDoc)
        # doing a query on terms of a phrase with a limit of 300 more results
        if len(results) < 300:
            singleWordResults = []
            for word in words:
                singleWordResults.append(self.oneWordQuery(word, True))
            y = 50 - len(results)
            x = self.getMoreResults(singleWordResults, y)
            if x is not None:
                results += x
        return results

    # parent query method and classifier
    def doQuery(self, words):
        results = {}
        queryWord = []
        words = words.strip().split(" ")
        for word in words:
            word = word.lower()
            word = self.stemmer.stemWord(word)
            if word not in self.stop_words:
                queryWord.append(word)
        if len(queryWord) == 0:
            return {}
        elif len(queryWord) > 1:
            docIds = self.phraseQuery(queryWord)
            for id in docIds:
                results[id] = self.titles.get(id)
        else:
            docIds = self.oneWordQuery(queryWord[0], False)
            for id in docIds:
                results[id] = self.titles.get(id)
        return results
def stemming(data):
    # Stemming
    stemmer = Stemmer("english")
    stemmedData = [stemmer.stemWord(key) for key in data]
    return stemmedData
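PyStemmer also ships a batch method that covers this whole pattern in a single call; a minimal sketch, assuming the usual import:

from Stemmer import Stemmer  # PyStemmer

stemmer = Stemmer("english")
print(stemmer.stemWords(["running", "flies", "easily"]))  # ['run', 'fli', 'easili']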
print("Do u want to query ? (y/n) ") c = raw_input() while c[0] == 'y': print(" What is your phrase query ? ") words = raw_input().strip('\n').lower() clock_start = time.time() docs = defaultdict(float) #t is for title #p is for text #c is for category words = words.split(' ') for word in words: if (':' in word): word = word.split(':') word[1] = ps.stemWord(word[1]) if (word[0] == 't'): query_with_tag(word[1], 0) elif (word[0] == 'p'): query_with_tag(word[1], 1) elif (word[0] == 'c'): query_with_tag(word[1], 2) else: query_without_tag(word[1]) else: word = ps.stemWord(word) query_without_tag(word) relevance_ranking() print("Query time = " + str(time.time() - clock_start)) print("Do u want to query ? (y/n) ") c = raw_input()
index = 0
for line in f:
    print index
    data = data + line[:-1] + ". "
    count = count + 1
    if count % 20 == 0:
        words = []
        nouns = []
        for t in data.split():
            #print nltk.tag.str2tuple(t)[1]
            try:
                if nltk.tag.str2tuple(t)[1][0] == 'N':
                    #no_of_nouns = no_of_nouns + 1
                    #n_n[stem.stemWord(nltk.tag.str2tuple(t)[0].lower())] = 1
                    nouns.append(stem.stemWord(nltk.tag.str2tuple(t)[0].lower()))
            except:
                g = 1
            words.append(nltk.tag.str2tuple(t)[0].lower())
        data = ""
        train_set = []
        train_labels = []
        for i in range(0, len(words) - WINDOW):
            temp = []
            for j in range(i, i + WINDOW):
                temp.append(words[j].lower())
            if stem.stemWord(words[i + WINDOW].lower()) not in nouns:
                temp = []
                continue
    exit(0)
else:
    import pdb
    pdb.set_trace()

try:
    annlist = annotations['annotations']
    for ann in annlist:
        start = ann['start']
        end = ann['end']
        if start == 0:
            if end == len(name):
                wiki_match, confidence = wikipedia_match(ann)
            else:
                title = ann['title']
                # Note: both sets below are built from title.split(); the
                # first was probably meant to use name.split(), so as
                # written the comparison is always True.
                name_words = set([stemmer.stemWord(word.lower())
                                  for word in title.split()
                                  if word not in stopwords])
                title_words = set([stemmer.stemWord(word.lower())
                                   for word in title.split()
                                   if word not in stopwords])
                if name_words == title_words:
                    wiki_match, confidence = wikipedia_match(ann)
except Exception as e:
    import pdb
    pdb.set_trace()

if wiki_match:
for line in f:
    print correct, incorrect
    print index
    data = data + line[:-1] + ". "
    count = count + 1
    if count % 20 == 0:
        words = []
        nouns = []
        for t in data.split():
            #print nltk.tag.str2tuple(t)[1]
            try:
                if nltk.tag.str2tuple(t)[1][0] == 'N':
                    #no_of_nouns = no_of_nouns + 1
                    #n_n[stem.stemWord(nltk.tag.str2tuple(t)[0].lower())] = 1
                    nouns.append(
                        stem.stemWord(nltk.tag.str2tuple(t)[0].lower()))
            except:
                g = 1
            words.append(nltk.tag.str2tuple(t)[0].lower())
        data = ""
        train_set = []
        train_labels = []
        for i in range(0, len(words)):
            train_set.append(words[i].lower())
            #train_labels.append(stem.stemWord(words[i+WINDOW].lower()))
        model = Word2Vec(train_set, min_count=1)
        #print train_set
        sum_ele = 0
        max_sum = -10
class stemming:
    def __init__(self, language='english'):
        self.stemmer = Stemmer(language)

    def __call__(self, content):
        return [self.stemmer.stemWord(word) for word in content]
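A short usage sketch for the callable class above (assumes PyStemmer's from Stemmer import Stemmer):

stem = stemming('english')
print(stem(['cats', 'running']))  # -> ['cat', 'run']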
    new_string = document_titles.readline().strip()
    output += new_string + "\n"

print(output)
# with open(outTxtFlPth, 'a+') as f:
#     print(output, file=f)

###########################################################################
# make a list of all the stopwords.
if os.path.exists(os.path.join(absltPthCurrPrgrm, 'stopwords.txt')):
    with open(os.path.join(absltPthCurrPrgrm, 'stopwords.txt'), 'r') as file:
        words = file.read().split('\n')
        # stem the stop words
        for word in words:
            word = ps.stemWord(word)
            if word:
                stopwords[word] = 1
else:
    print("stopwords.txt does not exist in the directory")
    sys.exit()

for i in range(3):
    mapping[i] = index_term_mapping(i)
create_offset()

##########################################################################
# get queries into a list
# with open(qryTxtFlPth) as f:
def stemmer(listofTokens):
    # Stemming
    stemmer = Stemmer("english")
    stemmedWords = [stemmer.stemWord(key) for key in listofTokens]
    return stemmedWords
# getting references
ref = find_between(text, "eferences==", "==") + find_between(
    text, "eferences ==", "==")
text = text.replace(ref, '')

# clearing up the dictionary, and working on each field
article_dict = {}

# TITLE
field_tokens = []
title = re.sub('[^A-Za-z]', ' ', title)
chunk = nltk.word_tokenize(title.lower())
stopped_tokens = [i for i in chunk if i not in stop_words]
for i in stopped_tokens:
    try:
        field_tokens.append(p_stemmer.stemWord(i))
    except:
        field_tokens.append(i)
for i in field_tokens:
    if i in article_dict:
        freq = int(find_between(article_dict[i], "(", ")")) + 1
        if "T" in article_dict[i]:
            article_dict[i] = find_between(article_dict[i], "", "(") + "(%d)" % freq
        else:
            article_dict[i] = "T" + find_between(article_dict[i], "", "(") + "(%d)" % freq
    else:
        article_dict[i] = "T%d(1)" % count

# BODY TEXT
    'large', 'database', 'WSDM', 'web', 'search', 'data', 'mining', 'WWW',
    'web'
]

stemmer = Stemmer('english')
new_keywords = set()
for keyword in keywords:
    new_keywords.add(stemmer.stemWord(keyword.lower()))
keywords = new_keywords

conferences = defaultdict(int)


def matches_keywords(title):
    title_keywords = set([stemmer.stemWord(w) for w in title.lower().split()])
    return len(title_keywords.intersection(keywords))


def matches_confs(conf):
    return any([conf.strip() in real_conf for real_conf in confs])


citations_found = 0
with open('arnetminer_full.txt') as f:
TGrammar = Dict[str, RuleSet]


def validate_grammar(grammar: TGrammar):
    for ruleset in grammar.values():
        for rule in ruleset.rules:
            for t in rule.tokens:
                if isinstance(t, RefToken):
                    assert t.target in grammar, \
                        f'Invalid target {t.target} ' \
                        f'in rule: \'{to_string(rule.tokens)}\' ' \
                        f'in ruleset \'{ruleset.name}\''


tokenizer = Tokenizer(text_postprocessing_fn=lambda t: STEMMER.stemWord(t))


def tokenize(text: str, parameters: bool) -> tuple[Token]:
    return tokenizer.tokenize(text)


def grammar_from_dict(data: dict) -> TGrammar:
    grammar = {}
    for key, ruleset_raw in data.items():
        rules = []
        for raw_rule in ruleset_raw:
            if isinstance(raw_rule, str):
                rules.append(Rule(tokens=tokenize(raw_rule, parameters=True)))
            elif isinstance(raw_rule, dict):
                for k, v in raw_rule.items():
        listofWords[i] = topofFile[i].split(':')
        if listofWords[i][0] not in heap:
            heapq.heappush(heap, listofWords[i][0])

    if count == 100000:
        print("100000 exceeded")
        writeintofile(tagno, path, data)
        data = defaultdict(list)
        count = 0

if count > 0:
    writeintofile(tagno, path, data)
    data = defaultdict(list)

# reading the stopwords (words to be ignored)
with open('stopwords.txt', 'r') as file:
    words = file.read().split('\n')  # putting the stopwords into a list "words"
    for x in words:
        x = ps.stemWord(x)  # stem the stopwords
        stopwords[x] = 1

doccnt = 0
docno = 0
for event, element in ET.iterparse(XMLLOC, events=("start", "end")):
    chop = element.tag
    # The tag carries an XML namespace prefix like '{...}page';
    # we only need the local name after the closing brace.
    idx = chop.rfind("}")
    if idx != -1:
        chop = chop[idx + 1:]
    if chop == 'page' and event == 'end':
        # this is the code for a particular page
        for w in wordsdict:
            for t in tags:
                if cnt[tags[t]][w] > 0:
class Searcher(object):
    """Run a search on documents or objects within documents in the SQLite table.
    Three scoring options are available: frequency, TF-IDF and BM25.
    Two methods of incrementing the scores of results are available:
    simple addition or best score."""

    def __init__(self, query, db, doc_level_search=True, stemmer=False,
                 path='/var/lib/philologic/databases/'):
        self.path = path + db + '/'
        self.words = query.split()
        self.doc_level_search = doc_level_search
        self.results = {}
        if doc_level_search:
            self.doc_path = self.path + 'doc_arrays/'
        else:
            self.doc_path = self.path + 'obj_arrays/'
        self.stemmer = stemmer
        if stemmer:
            try:
                from Stemmer import Stemmer
                self.stemmer = Stemmer(stemmer)  # where stemmer is the language selected
                self.words = [self.stemmer.stemWord(word) for word in self.words]
            except KeyError:
                print >> sys.stderr, "Language not supported by stemmer. No stemming will be done."
            except ImportError:
                print >> sys.stderr, "PyStemmer is not installed on your system. No stemming will be done."

    def get_hits(self, word, doc=True):
        """Query the SQLite table and return a list of tuples containing the results"""
        cursor = sqlite_conn(self.path + 'hits_per_word.sqlite')
        if self.doc_level_search:
            cursor.execute('select doc_id, word_freq, total_words from doc_hits where word=?', (word,))
        else:
            cursor.execute('select obj_id, word_freq, total_words from obj_hits where word=?', (word,))
        return cursor.fetchall()

    def id_to_word(self, id):
        """Return the word given its ID"""
        m = mapper(self.path)
        return m[id]

    def get_idf(self, hits):
        """Return IDF score"""
        total_docs = doc_counter(self.doc_path)  #### WRONG COUNT
        try:
            return log(float(total_docs) / float(len(hits))) + 1
        except ZeroDivisionError:
            return 0

    def search(self, measure='tf_idf', scoring='simple_scoring', intersect=False, display=10):
        """Searcher function"""
        self.intersect = False
        if self.words != []:
            for word in self.words:
                hits = self.get_hits(word)
                getattr(self, measure)(hits, scoring)
                if intersect:
                    if self.intersect:
                        self.docs = self.docs.intersection(self.new_docs)
                        self.new_docs = set([])
                    else:
                        self.intersect = True
                        self.docs = set([obj_id for obj_id in self.results])
                        self.new_docs = set([])
            if intersect:
                self.results = dict([(obj_id, self.results[obj_id])
                                     for obj_id in self.results if obj_id in self.docs])
            return sorted(self.results.iteritems(), key=itemgetter(1), reverse=True)[:display]
        else:
            return []

    def debug_score(self, hits, scoring):
        for obj_id, word_freq, word_sum in hits:
            getattr(self, scoring)(obj_id, word_freq)

    def tf_idf(self, hits, scoring):
        idf = self.get_idf(hits)
        for obj_id, word_freq, word_sum in hits:
            tf = float(word_freq) / float(word_sum)
            score = tf * idf
            getattr(self, scoring)(obj_id, score)

    def frequency(self, hits, scoring):
        for obj_id, word_freq, word_sum in hits:
            score = float(word_freq) / float(word_sum)
            getattr(self, scoring)(obj_id, score)

    def bm25(self, hits, scoring, k1=1.2, b=0.75):
        ## a floor is applied to the normalized length of the doc
        ## in order to diminish the importance of small docs
        ## see http://xapian.org/docs/bm25.html
        idf = self.get_idf(hits)
        avg_dl = avg_doc_length(self.path)
        for obj_id, word_freq, obj_length in hits:
            tf = float(word_freq)
            dl = float(obj_length)
            temp_score = tf * (k1 + 1.0)
            temp_score2 = tf + k1 * ((1.0 - b) + b * floor(dl / avg_dl))
            score = idf * temp_score / temp_score2
            getattr(self, scoring)(obj_id, score)

    def simple_scoring(self, obj_id, score):
        if self.intersect:
            self.new_docs.add(obj_id)
        if obj_id not in self.results:
            self.results[obj_id] = score
        else:
            self.results[obj_id] += score

    def dismax_scoring(self, obj_id, score):
        if self.intersect:
            self.new_docs.add(obj_id)
        if obj_id not in self.results:
            self.results[obj_id] = score
        else:
            if score > self.results[obj_id]:
                self.results[obj_id] = score

    def lda_search(self, measure='tf_idf', scoring='simple_scoring', intersect=False, display=10):
        """Searcher function"""
        self.intersect = False
        self.words = [words.decode('utf-8') for words in self.words]
        if self.words != []:
            lda_query = self.match_topic()
            if lda_query != None:
                for word in self.words[:1]:  # temporary slice, to offer it as an option?
                    lda_query[word] = sum([lda_query[term] for term in lda_query])
                print lda_query
                self.num_hits = {}
                for other_word, freq in lda_query.iteritems():
                    hits = self.get_hits(other_word)
                    results = self.lda_scoring(hits, scoring, freq, measure)
                self.results = dict([(obj_id, self.results[obj_id] * self.num_hits[obj_id])
                                     for obj_id in self.results if self.num_hits[obj_id] > 1])
                return sorted(self.results.iteritems(), key=itemgetter(1), reverse=True)[:display]
            else:
                return []
        else:
            return []

    def match_topic(self):
        topic_id = int
        cursor = sqlite_conn(self.path + 'lda_topics.sqlite')
        if len(self.words) == 1:
            cursor.execute('select topic, position from word_position where word=? order by position', (self.words[0],))
            try:
                topic_id = cursor.fetchone()[0]
            except TypeError:
                return None
        else:
            topic_pos = {}
            topic_matches = {}
            query = 'select topic, position from word_position where word="%s"' % self.words[0]
            for word in self.words[1:]:
                query += ' or word="%s"' % word
            cursor.execute(query)
            for topic, position in cursor.fetchall():
                if topic not in topic_pos:
                    topic_pos[topic] = position
                    topic_matches[topic] = 1
                else:
                    topic_pos[topic] += position
                    topic_matches[topic] += 1
            word_num = len(self.words)
            topics = [(topic, topic_pos[topic]) for topic in topic_pos
                      if topic_matches[topic] == word_num]
            if topics == []:
                topics = [(topic, topic_pos[topic]) for topic in topic_pos
                          if topic_matches[topic] == word_num - 1]
            topic_id = sorted(topics, key=itemgetter(1))[0][0]
        cursor.execute('select words from topics where topic=?', (topic_id,))
        results = json.loads(cursor.fetchone()[0])
        topic = [(term, float(freq)) for term, freq in results.iteritems()]  # if float(freq) > 0.01
        topic = dict(sorted(topic, key=itemgetter(1), reverse=True)[:10])
        return topic

    def lda_scoring(self, hits, scoring, freq, measure):
        if measure == 'tf_idf':
            idf = self.get_idf(hits)
            for obj_id, word_freq, word_sum in hits:
                tf = float(word_freq) / float(word_sum)
                score = tf * idf * freq
                if obj_id not in self.results:
                    self.results[obj_id] = score
                    self.num_hits[obj_id] = 1
                else:
                    self.results[obj_id] += score
                    self.num_hits[obj_id] += 1
        else:
            idf = self.get_idf(hits)
            avg_dl = avg_doc_length(self.path)
            k1 = 1.2
            b = 0.75
            for obj_id, word_freq, obj_length in hits:
                tf = float(word_freq)
                dl = float(obj_length)
                temp_score = tf * (k1 + 1.0)
                temp_score2 = tf + k1 * ((1.0 - b) + b * floor(dl / avg_dl))
                score = idf * temp_score / temp_score2 * freq
                if obj_id not in self.results:
                    self.results[obj_id] = score
                    self.num_hits[obj_id] = 1
                else:
                    self.results[obj_id] += score
                    self.num_hits[obj_id] += 1
porter = PorterStemmer()
snowball = SnowballStemmer("english")
isri = ISRIStemmer()
rslp = RSLPStemmer()
porter2 = Stemmer('english')

endOfString = StringEnd()
prefix = oneOf(
    "uni inter intro de con com anti pre pro per an ab ad af ac at as re in im ex en em un dis over sub syn out thermo philo geo for fore back"
)
suffix = oneOf("ish")
#suffix = oneOf("or er ed ish ian ary ation tion al ing ible able ate ly ment ism ous ness ent ic ive "
#               "ative tude ence ance ise ant age cide ium ion")
word = (Optional(prefix)("prefixes") +
        SkipTo(suffix | suffix + FollowedBy(endOfString) | endOfString)("root") +
        ZeroOrMore(suffix | suffix + FollowedBy(endOfString))("suffix"))
#word = (Optional(prefix)("prefixes") + SkipTo(FollowedBy(endOfString))("root"))

# fragment: the loop below sits inside a try block whose opening is not shown
    for wd in wordlist:
        print wd
        stem = lanster.stem(wd)
        print "LansterStemmer:" + stem
        print "PorterStemmer2:" + porter2.stemWord(wd)
        #res = word.parseString(stem)
        #print res.dump()
        #print
finally:
    file.close()
def process_text(text, stemming=True):
    words = _tokenize(text)
    if not stemming:
        return words
    stemmer = Stemmer('english')
    return [stemmer.stemWord(word) for word in words]
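A closing sketch wiring process_text to a hypothetical _tokenize helper (the real one is not shown in the snippet; PyStemmer import assumed):

from Stemmer import Stemmer  # PyStemmer


def _tokenize(text):
    # hypothetical stand-in: lower-case whitespace tokenization
    return text.lower().split()


print(process_text('Cats are running'))  # -> ['cat', 'are', 'run']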