def _pass2_process_token(self, document, position, zone, token):
    # Vector space structure:
    #   vector_space[d][t][0] = normalized frequency of term t in document d
    #   vector_space[d][t][1] = positions of term t in document d for each zone
    #   vector_space[d][t][2] = frequency of term t in document d
    # Positions are in this format: ZoneNumber | position

    # Ensure token is in lowercase and eligible for index
    if token in constants.DO_NOT_INDEX or len(token) <= 1:
        return

    # First, let's stem the token
    p = PorterStemmer()
    token = token.lower()
    token = p.stem(token, 0, len(token) - 1)

    # Find term's index in vector space
    if token not in self._ifile:
        return
    t = self._ifile[token][0]

    if t not in self._vector_space[document.document_id]:
        self._vector_space[document.document_id][t] = [0.0, [[], [], [], []], 0]

    self._vector_space[document.document_id][t][0] += \
        Zones.WEIGHTS[zone] / document.weighted_length
    self._vector_space[document.document_id][t][1][zone].append(position)
    self._vector_space[document.document_id][t][2] += 1
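# Illustrative sketch only (not part of the original indexer): one way the
# second pass might drive _pass2_process_token over a document. The names
# "document.zones" and "tokenize" are assumptions used for illustration;
# the real driver may differ.
def _pass2_process_document(self, document, tokenize):
    # Start with an empty term vector for this document
    self._vector_space[document.document_id] = {}
    # Walk every zone (e.g. title, headings, body, anchor text) and feed each
    # token, with its zone number and position, into _pass2_process_token
    for zone, zone_text in enumerate(document.zones):
        for position, token in enumerate(tokenize(zone_text)):
            self._pass2_process_token(document, position, zone, token)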
def _pass1_process_token(self, doc_id, token):
    # Inverted file structure:
    #   self._ifile[token] = [id, df, postings_list]

    # Let's make sure the token is not in our "do not index" list
    if token in constants.DO_NOT_INDEX or len(token) <= 1:
        return

    # First, let's stem the token
    p = PorterStemmer()
    token = token.lower()
    token = p.stem(token, 0, len(token) - 1)

    with self._ifile_lock:
        if self._ifile is None:
            logging.error("INDEXER-P1: Attempting to index a document"
                          " while the index file is closed")
            raise Exception("Index file has been closed")
        if token not in self._ifile:
            self._ifile[token] = [0, 0, set()]
        self._ifile[token][2].add(doc_id)
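# Illustrative sketch only: _pass1_process_token initializes each entry as
# [0, 0, set()], so the term id and document frequency still have to be
# filled in once pass 1 is complete. The method name "_pass1_finalize" is an
# assumption for illustration; the real project may do this step elsewhere.
def _pass1_finalize(self):
    with self._ifile_lock:
        if self._ifile is None:
            raise Exception("Index file has been closed")
        # Assign sequential term ids and derive df from the postings set
        for term_id, token in enumerate(sorted(self._ifile)):
            entry = self._ifile[token]
            entry[0] = term_id        # id
            entry[1] = len(entry[2])  # df = number of documents containing the term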
def from_string(query):
    """Parse the specified query string and return a Query object."""
    queryObj = Query()
    queryObj.phrase_search = False
    query = query.strip().lower()

    # Determine whether this is a "command" query
    if (query.startswith("similar ") or query.startswith("df ") or
            query.startswith("freq ") or query.startswith("doc ") or
            query.startswith("tf ") or query.startswith("title ")) and \
            len(query.split(" ")) > 1:
        queryObj.cmd = query.split(" ")[0].strip()
        # Remove the command from the query string
        query = query.replace(queryObj.cmd + " ", "", 1)

        # For "tf" queries, extract the first parameter early on, so we
        # don't have to special-case it later when processing query terms
        if queryObj.cmd == "tf":
            if len(query.split(" ")) < 2:
                # This is not a valid "tf" query
                queryObj.cmd = None
            else:
                queryObj.raw_terms.append(query.split(" ")[0])
                query = " ".join(query.split(" ")[1:])

    # Clean up and determine whether this is a phrase search
    if query.replace("!", "").startswith('"'):
        queryObj.phrase_search = True

    last_grp = None
    gid = 0
    _groups = []

    # Populate groups
    if not queryObj.phrase_search:
        for group in query.split(" "):
            if group.strip().startswith("!"):
                _groups.append(group.strip()[1:])
                queryObj.negated_groups[gid] = True
            else:
                _groups.append(group.strip())
                queryObj.negated_groups[gid] = False
            gid += 1
    else:
        for group in query.split('"'):
            if group.strip(' "') == '':
                continue
            if group.strip(' "') == '!':
                last_grp = group
                continue
            if last_grp is not None and "!" in last_grp:
                _groups.append(group)
                queryObj.negated_groups[gid] = True
            else:
                _groups.append(group)
                queryObj.negated_groups[gid] = False
            gid += 1
            last_grp = group

    # Stem tokens in each group (except for "similar" queries)
    # and drop ineligible tokens
    for group in _groups:
        if queryObj.cmd == "doc" or queryObj.cmd == "title":
            _query_terms = group.split(" ")
        else:
            _query_terms = re.compile(indexer.constants.DELIMITERS).split(group)
        query_terms = []
        for term in _query_terms:
            term = term.lower()
            if term not in indexer.constants.DO_NOT_INDEX:
                queryObj.raw_terms.append(term)
                # Stem
                if queryObj.cmd != "similar":
                    p = PorterStemmer()
                    term = p.stem(term, 0, len(term) - 1)
                query_terms.append(term)
        queryObj.groups.append(' '.join(query_terms))

    return queryObj
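# Illustrative usage of from_string, assuming it is exposed on Query (e.g. as
# a staticmethod). The field values shown are approximate, not test output;
# the exact stemmed forms depend on the Porter stemmer.
#
#   q = Query.from_string('"information retrieval" !"boolean model"')
#   # q.phrase_search  -> True
#   # q.groups         -> roughly ['inform retriev', 'boolean model'] (stemmed)
#   # q.negated_groups -> {0: False, 1: True}
#   # q.raw_terms      -> ['information', 'retrieval', 'boolean', 'model']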