# Module-level imports needed by this class:
import random

from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix
from irlib.metrics import Metrics


class QA:

    def __init__(self):
        self.file_name = 'qa.txt'
        self.qa_list = {}
        self.qa_id = 0
        self.prep = Preprocessor()
        self.mx = Matrix()
        self.metric = Metrics()

    def randomize(self, a):
        # Fill the vector with random 0/1 values; used as a fallback when
        # none of the query terms appear in the vocabulary.
        for i in range(len(a)):
            a[i] = random.randint(0, 1)

    def readfile(self):
        # Parse qa.txt, where questions and answers are written as
        # 'q: ...' and 'a: ...' lines respectively.
        fd = open(self.file_name, 'r')
        for line in fd.readlines():
            line = line.strip().lower().split(':')
            if len(line) != 2:
                continue
            elif line[0] == 'q':
                q_line = line[1]
                self.qa_id += 1
                self.qa_list[self.qa_id] = {'q': q_line, 'a': ''}
                terms = self.prep.ngram_tokenizer(text=q_line)
                self.mx.add_doc(doc_id=self.qa_id, doc_terms=terms,
                                frequency=True, do_padding=True)
            elif line[0] == 'a' and self.qa_id in self.qa_list:
                # Guard against an answer appearing before any question.
                self.qa_list[self.qa_id]['a'] = line[1]
        fd.close()
        #print 'Number of read questions and answers:', len(self.mx.docs)
        #print 'Number of read terms', len(self.mx.terms)

    def ask(self, q=''):
        # Find the indexed question nearest to q (Euclidean distance)
        # and print its stored answer.
        q_id = 0
        q_distance = float('inf')
        terms = self.prep.ngram_tokenizer(text=q)
        q_vector = self.mx.query_to_vector(terms, frequency=False)
        if sum(q_vector) == 0:
            self.randomize(q_vector)
        for doc in self.mx.docs:
            distance = self.metric.euclid_vectors(doc['terms'], q_vector)
            if distance < q_distance:
                q_distance = distance
                q_id = doc['id']
        print 'Tarek:', self.qa_list[q_id]['a']
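A minimal usage sketch, assuming a qa.txt in the working directory with alternating 'q: ...' and 'a: ...' lines (the file name is the class default; the question text here is illustrative):

if __name__ == '__main__':
    qa = QA()
    qa.readfile()                  # index every 'q:' line in qa.txt
    qa.ask('how are you today?')   # prints the answer of the closest question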
# Module-level imports assumed by this method:
#   from collections import Counter
#   from prettytable import PrettyTable
def validateResponse(self, response, queryTerms):
    from bs4 import BeautifulSoup
    if response.status_code == 200:
        soup = BeautifulSoup(response.text)
        from irlib.preprocessor import Preprocessor
        from irlib.matrix import Matrix
        from irlib.metrics import Metrics
        prep = Preprocessor()
        mx = Matrix()
        metric = Metrics()
        # Tokenize the page text and index it under its URL.
        terms = prep.ngram_tokenizer(text=soup.get_text())
        mx.add_doc(doc_id=response.url, doc_terms=terms,
                   frequency=True, do_padding=True)
        '''for doc in mx.docs:
            distance = metric.euclid_vectors(doc['terms'], q_vector)
            print distance
        '''
        # Count term frequencies and tabulate the terms that match the query.
        cnt = Counter()
        for word in terms:
            cnt[word] += 1
        tableTerms = PrettyTable(["Term", "Frequency"])
        for word in sorted(cnt, key=cnt.get, reverse=True):
            # Note: encode('ascii') will raise on non-ASCII terms scraped
            # from the page.
            if word.encode('ascii').lower() in queryTerms.encode('ascii').lower().split():
                tableTerms.add_row([word, cnt[word]])
        print tableTerms
    else:
        print "[-] Response for %s is %s" % (response.url, response.status_code)
# Module-level imports assumed by this method:
#   from collections import Counter
#   from texttable import Texttable
def __validateResponse(self, response, queryTerms):
    from bs4 import BeautifulSoup
    if response.status_code == 200:
        soup = BeautifulSoup(response.text)
        from irlib.preprocessor import Preprocessor
        from irlib.matrix import Matrix
        from irlib.metrics import Metrics
        prep = Preprocessor()
        mx = Matrix()
        metric = Metrics()
        terms = prep.ngram_tokenizer(text=soup.get_text())
        mx.add_doc(doc_id=response.url, doc_terms=terms,
                   frequency=True, do_padding=True)
        # Count term frequencies and render the matching query terms in a
        # two-column Texttable.
        cnt = Counter()
        for word in terms:
            cnt[word] += 1
        table = Texttable()
        table.set_cols_align(["l", "l"])
        table.set_cols_valign(["m", "m"])
        table.set_cols_width([40, 55])
        rows = [["Term", "Frequency"]]
        for word in sorted(cnt, key=cnt.get, reverse=True):
            if word.lower() in queryTerms.lower().split():
                rows.append([word, cnt[word]])
        table.add_rows(rows)
        print table.draw() + "\n"
    else:
        print "[-] Response for %s is %s" % (response.url, response.status_code)
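A hedged driver sketch for either variant above. The snippets show the method unbound, so this sketch calls the first one as a plain function and passes None for self (which the body never uses); the requests call and the example.com URL and query string are illustrative, not from the original:

import requests

resp = requests.get('http://example.com/')
validateResponse(None, resp, 'example domain')   # prints the term table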
# Module-level imports needed here:
import os

from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix


def readfiles(fold_path='all-folds/fold1/'):
    prep = Preprocessor()
    mx = Matrix()
    files = os.listdir(fold_path)
    for filename in files:
        fd = open('%s/%s' % (fold_path, filename), 'r')
        file_data = fd.read()
        fd.close()
        terms = prep.ngram_tokenizer(text=file_data)
        mx.add_doc(doc_id=filename, doc_terms=terms,
                   frequency=True, do_padding=True)
    print 'Number of read documents:', len(mx.docs)
    print 'Number of read terms', len(mx.terms)
    #print mx.terms[0:5], mx.terms[-5:-1]
    print mx.terms
    print mx.docs
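The final prints above dump the raw Matrix internals. Judging from how the other snippets index them, mx.terms holds the vocabulary and each entry of mx.docs pairs a document id with a frequency vector aligned to that vocabulary. A small hedged illustration (the exact printed representation may differ):

from irlib.matrix import Matrix

mx = Matrix()
mx.add_doc(doc_id='d1', doc_terms=['to', 'be', 'or', 'not', 'to', 'be'],
           frequency=True, do_padding=True)
print mx.terms   # the vocabulary, e.g. ['to', 'be', 'or', 'not']
print mx.docs    # e.g. [{'id': 'd1', 'terms': [2, 2, 1, 1]}]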
# Module-level imports needed by this class:
import os
import sys
from pickle import Pickler, Unpickler

from nltk.corpus import stopwords

from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix


class Search:

    def __init__(self):
        self._mx = Matrix()
        self._prep = Preprocessor(pattern='\W+', lower=True, stem=True)

    def readfiles(self, fold_path='all-folds/fold1/'):
        # Resolve fold_path relative to the script's own directory.
        ruta = os.path.split(sys.argv[0])
        abs = os.path.join(ruta[0], fold_path)
        files = os.listdir(abs)
        for filename in files:
            abs_arch = os.path.join(abs, filename)
            fd = open(abs_arch, 'r')
            file_data = fd.read()
            fd.close()
            self.createMX(filename, file_data)
        print 'Number of read documents:', len(self._mx.docs)
        print 'Number of read terms', len(self._mx.terms)
        #print mx.terms[0:5], mx.terms[-5:-1]
        '''print mx.terms
        for doc in mx.docs:
            print doc'''
        self.saveMX(self._mx)
        print 'process finished'

    def saveMX(self, mx):
        # Pickle the matrix so later searches can skip re-indexing.
        ruta = os.path.split(sys.argv[0])
        abs = os.path.join(ruta[0], "db/matrix.mx")
        filemx = open(abs, 'w')
        serializer = Pickler(filemx)
        serializer.dump(mx)
        filemx.close()
        print 'matrix saved'

    def createMX(self, file_id, file_data, lenguaje='english'):
        # Drop stopwords before tokenizing, then add the document.
        stop = stopwords.words(lenguaje)
        file = file_data.split(" ")
        content = [w for w in file if w.lower() not in stop]
        data = str(content)
        terms = self._prep.ngram_tokenizer(text=data)
        if len(terms) > 0:
            self._mx.add_doc(doc_id=file_id, doc_terms=terms,
                             frequency=True, do_padding=True)

    def search(self):
        # Load the pickled matrix, then score every document against the
        # query terms passed on the command line.
        ruta = os.path.split(sys.argv[0])
        abs = os.path.join(ruta[0], "db/matrix.mx")
        filemx = open(abs, 'r')
        serializer = Unpickler(filemx)
        self._mx = serializer.load()
        filemx.close()
        cadena = sys.argv
        del cadena[0]   # drop the script name
        del cadena[0]   # drop the subcommand, keeping only the query words
        cade = str(cadena).lower()
        cad = self._prep.ngram_tokenizer(text=cade)
        resultado = list()
        for doc in self._mx.docs:
            vector = list()
            for q in cad:
                if q in self._mx.terms:
                    pos = self._mx.terms.index(q)
                    vector.append(doc['terms'][pos])
            resultado.append((doc['id'], vector))
        resultado.sort(lambda a, b: self.__Deuclidiana(a[1]) - self.__Deuclidiana(b[1]),
                       reverse=True)
        print resultado

    def __Deuclidiana(self, vector):
        # Squared Euclidean norm of the matched-term frequency vector.
        dist = 0
        for v in vector:
            dist += v ** 2
        return int(dist)

    def main(self):
        #self.readfiles()
        self.search()
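A hedged run-through, assuming the class is saved as search.py next to an all-folds/fold1/ corpus and a db/ directory (both paths come from the code above; the query words are illustrative):

if __name__ == '__main__':
    # First run: uncomment self.readfiles() in main() to build db/matrix.mx.
    # Then query from the shell:  python search.py search your query words
    Search().main()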
def test_3gram_tokenizer(self):
    p = Preprocessor(lower=False, stem=False, ngram=3)
    returned_tokens = p.ngram_tokenizer('how do you do?')
    expected_tokens = ['how do you', 'do you do']
    self.assertEqual(returned_tokens, expected_tokens)
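To run this assertion standalone, a minimal harness (hedged: the TestCase class name here is made up for illustration; the method body matches the snippet above):

import unittest
from irlib.preprocessor import Preprocessor

class TestNgramTokenizer(unittest.TestCase):   # hypothetical class name
    def test_3gram_tokenizer(self):
        p = Preprocessor(lower=False, stem=False, ngram=3)
        self.assertEqual(p.ngram_tokenizer('how do you do?'),
                         ['how do you', 'do you do'])

if __name__ == '__main__':
    unittest.main()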