def getLinks(url, soup):
    """Indexes the words of the page into db and returns the list of urls it links to."""
    currentWords = []
    listURLs = []
    wordsInPage = soup.get_text()
    for word in wordsInPage.split():
        if word not in db:
            # First occurrence of the word anywhere: create its entry.
            db[word] = [(util.clean_words(soup.title.text), url)]
        elif word not in currentWords:
            # Known word, but first occurrence on this page: record the page once.
            db[word].append((util.clean_words(soup.title.text), url))
        currentWords.append(word)
    for link in soup.find_all("a"):
        newUrl = urllib.parse.urljoin(url, link.get("href"))
        listURLs.append(newUrl)
    return listURLs
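A minimal driver sketch for getLinks, assuming the module-level db dictionary and the util module with clean_words that the function relies on; index_page and the example URL are hypothetical.

import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
# util.clean_words is assumed to come from the same project.

db = {}  # word -> list of (title, url) pairs, filled in by getLinks

def index_page(url):
    page = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(page, "html.parser")
    return getLinks(url, soup)  # index the page and return its outgoing links

# links = index_page("https://example.com/index.html")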
def search():
    """Returns the results page."""
    query = util.clean_words(request.args.get("query", ""))
    if query == "":
        return render_template("index.html")
    else:
        results = moogle.answer(app.db, query)
        return render_template("search.html", authors=moogle.authors(),
                               query=query, results=results)
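One way the route above could be wired into a Flask application; the app setup, the /search endpoint, and the database file name are assumptions inferred from the function body, not the project's actual code.

from flask import Flask, request, render_template
import moogle
import util

app = Flask(__name__)
app.db = moogle.load("moogle.dat")             # database consumed by search()
app.add_url_rule("/search", "search", search)  # expose the view defined above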
def sanitizeText(text):
    try:
        text = util.clean_words(text)
    except Exception:
        # clean_words can fail on some inputs; fall back to the raw text
        # after forcing a UTF-8 round trip.
        text = text.encode('utf-8').decode('utf-8')
    words = text.split(' ')
    words = list(filter(None, words))  # drop empty strings produced by the split
    return [word for word in words if word not in STOP_WORDS]
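A quick, self-contained check of sanitizeText; the util stand-in and the STOP_WORDS set below are placeholders, not the project's real definitions.

import types
util = types.SimpleNamespace(clean_words=lambda s: s.lower())  # stand-in for the project's util
STOP_WORDS = {"the", "a", "of"}

print(sanitizeText("The history of  Python"))  # -> ['history', 'python']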
def make_data(self, trainfilename, maxseqlen=None, maxclauselen=None,
              label_ind=None, train=False):
    use_attention = self.params["use_attention"]
    batch_size = self.params["batch_size"]
    str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
    print("Filtering data")
    str_seqs = clean_words(str_seqs)
    label_seqs = to_BIO(label_seqs)
    if not label_ind:
        self.label_ind = {"none": 0}
    else:
        self.label_ind = label_ind
    seq_lengths = [len(seq) for seq in str_seqs]
    if self.maxseqlen is None:
        if maxseqlen:
            self.maxseqlen = maxseqlen
        elif self.params["maxseqlen"] is not None:
            self.maxseqlen = self.params["maxseqlen"]
        else:
            self.maxseqlen = max(seq_lengths)
    if self.maxclauselen is None:
        if maxclauselen:
            self.maxclauselen = maxclauselen
        elif self.params["maxclauselen"] is not None:
            self.maxclauselen = self.params["maxclauselen"]
        elif use_attention:
            sentence_lens = []
            for str_seq in str_seqs:
                for seq in str_seq:
                    tokens = self.tokenizer.tokenize(seq.lower())
                    sentence_lens.append(len(tokens))
            self.maxclauselen = np.round(
                np.mean(sentence_lens) + 3 * np.std(sentence_lens)).astype(int)
    if len(self.label_ind) <= 1:
        for str_seq, label_seq in zip(str_seqs, label_seqs):
            for label in label_seq:
                if label not in self.label_ind:
                    # Add new labels with values 0, 1, 2, ...
                    self.label_ind[label] = len(self.label_ind)
    self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
    discourse_generator = BertDiscourseGenerator(
        self.bert, self.tokenizer, str_seqs, label_seqs, self.label_ind,
        batch_size, use_attention, self.maxseqlen, self.maxclauselen, train)
    return seq_lengths, discourse_generator
def crawler_aux(db, url, maxdist):
    if maxdist > 0:
        try:
            response = urllib.request.urlopen(url)
            page = response.read()
            soup = BeautifulSoup(page, "html.parser")
            # Store the page title and the set of words found in the page.
            db[url] = (clean_words(soup.title.text),
                       set(clean_words(soup.get_text() + soup.title.text).split(" ")))
            for link in soup.find_all("a"):
                link_str = urllib.parse.urljoin(url, link.get("href"))
                # Follow only unvisited links that end in '.html'.
                if link_str not in db and link_str.endswith('.html'):
                    crawler_aux(db, link_str, maxdist - 1)
        except Exception as e:
            print("Exception found while reading the webpage {}\n{}\n".format(url, e))
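A possible top-level wrapper around crawler_aux, assuming the imports the function depends on; the wrapper name, start URL, and depth are illustrative only.

import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
# clean_words is assumed to be importable from the project's util module.

def crawl(url, maxdist):
    db = {}  # url -> (title, set of words on the page)
    crawler_aux(db, url, maxdist)
    return db

# db = crawl("https://example.com/index.html", 2)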
def make_data(self, trainfilename, maxseqlen=None, maxclauselen=None,
              label_ind=None, train=False):
    use_attention = self.params["use_attention"]
    maxseqlen = self.params["maxseqlen"]
    maxclauselen = self.params["maxclauselen"]
    batch_size = self.params["batch_size"]
    str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
    print("Filtering data")
    str_seqs = clean_words(str_seqs)
    label_seqs = to_BIO(label_seqs)
    if not label_ind:
        self.label_ind = {"none": 0}
    else:
        self.label_ind = label_ind
    seq_lengths = [len(seq) for seq in str_seqs]
    if not maxseqlen:
        maxseqlen = max(seq_lengths)
    if not maxclauselen:
        if use_attention:
            clauselens = []
            for str_seq in str_seqs:
                clauselens.extend([len(clause.split()) for clause in str_seq])
            maxclauselen = np.round(
                np.mean(clauselens) + 3 * np.std(clauselens)).astype(int)
    X = []
    Y = []
    Y_inds = []
    init_word_rep_len = len(self.rep_reader.word_rep)  # Vocab size
    if len(self.label_ind) <= 1:
        for str_seq, label_seq in zip(str_seqs, label_seqs):
            for label in label_seq:
                if label not in self.label_ind:
                    # Add new labels with values 0, 1, 2, ...
                    self.label_ind[label] = len(self.label_ind)
    self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
    discourse_generator = DiscourseGenerator(
        self.rep_reader, str_seqs, label_seqs, self.label_ind, batch_size,
        use_attention, maxseqlen, maxclauselen, train, self.input_size)
    self.maxseqlen = maxseqlen
    self.maxclauselen = maxclauselen
    return seq_lengths, discourse_generator
def main():
    parser = argparse.ArgumentParser(
        description="Answer module for the µoogle project",
        epilog=moogle.authors(),
    )
    parser.add_argument("-q", "--query", type=str,
                        help="query (use quotes for more than one word)")
    parser.add_argument("-d", "--database", type=str,
                        help="filename of the database", default="moogle.dat")
    args = parser.parse_args(sys.argv[1:])
    db = moogle.load(args.database)
    query = util.clean_words(args.query)
    answer = moogle.answer(db, query)
    pprint.pprint(answer)
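Assuming main() lives in a script such as answer.py (the file name is a guess), the argparse setup above implies an invocation like the one in the comment below.

if __name__ == "__main__":
    main()
# Query a previously built database (moogle.dat is the default):
#   python3 answer.py -q "cats and dogs" -d moogle.dat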
def calc(self, input_str, points):
    # Trivial case when there are no words.
    if input_str is None:
        return
    try:
        clear = util.clean_words(input_str)  # Clean the words from "litter".
        words = clear.split(" ")  # Split the string into individual words.
        # Update the score of every word of the string.
        for word in words:
            if word == '':
                continue
            # Score accumulated so far for this word (None if unseen).
            sum_points = self.words_weight.get(word)
            if word in self.CONST_STOP_WORDS:
                # Stop words only receive a fraction of the points.
                x = points * 0.1
            else:
                x = points
            if sum_points is None:
                self.words_weight[word] = x
            else:
                self.words_weight[word] += x
    except ValueError as e:
        print(e)
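A rough illustration of how calc accumulates word weights; the PageScorer host class, the util stand-in, and the stop-word set are hypothetical, while the 0.1 stop-word factor matches the method above.

import types
util = types.SimpleNamespace(clean_words=lambda s: s.lower())  # stand-in for the project's util

class PageScorer:
    CONST_STOP_WORDS = {"the", "and", "of"}

    def __init__(self):
        self.words_weight = {}  # word -> accumulated score

PageScorer.calc = calc  # attach the method defined above

scorer = PageScorer()
scorer.calc("The quick fox", points=10)
print(scorer.words_weight)  # -> {'the': 1.0, 'quick': 10, 'fox': 10}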
def crawler(url, maxdist):
    """
    Crawls the web starting from url, following up to maxdist links,
    and returns the built database.
    """
    db = {}  # dictionary (word found -> set of (title, url) of the pages it appears in)
    webs = []  # list of pages to process
    webs.append(tuple((url, 0)))  # the original url, at depth 0
    # webs holds every page to visit, as (url, depth) pairs.
    # We cannot really know whether a link is HTML until we open it and
    # look at the content type; the extension alone is not enough.
    # For example, apple.com/education is HTML because the server serves
    # the corresponding index.html.
    i = 0
    used_webs = set([url])  # pages already visited
    # We use a while loop instead of an iterator because the list is
    # dynamic (items are appended during the iteration), so it reads
    # more clearly this way.
    while i < len(webs):
        (web, depth) = webs[i]
        try:
            # Timeout of 3 seconds; the default value is too large.
            response = urllib.request.urlopen(web, timeout=3)
            content_type = response.info().get('Content-Type')
            # We check for "html" instead of content_type == 'text/html'
            # because some pages add extra information to the content type.
            if "html" not in content_type:
                # Skip non-HTML resources and move on to the next page.
                i += 1
                continue
            page = response.read()
            soup = BeautifulSoup(page, "html.parser")
        except Exception as e:
            # Any exception here comes from a connection error (sockets,
            # handshake, ...) or an HTTP error (404 not found, 403 forbidden, ...).
            print(e)
            i += 1
            continue
        try:
            # As said above, every word found in any of the pages gets an
            # entry in the dictionary with the set of (title, url) pages
            # where it appears.
            title = clean_words(soup.title.text)
            text = clean_words(soup.get_text())
            content = title + text
            added_words = set([])
            # For every word in the content of the page:
            for word in content.split():
                if word in added_words:
                    continue
                if word not in db:
                    db[word] = set([tuple((title, web))])
                else:
                    db[word].add(tuple((title, web)))
                added_words.add(word)
        except Exception as e:
            # An exception could be raised if a page had no content, or if
            # clean_words failed on some special character.
            print(e)
        # Add all non-repeated links that use the http or https protocol.
        # If we are already at the maximum depth, we do not follow any link.
        # Children get the parent's depth plus one.
        if depth < maxdist:
            for link in soup.find_all("a"):
                try:
                    newurl = urllib.parse.urljoin(web, link.get("href"))
                    (newweb, fragment) = urllib.parse.urldefrag(newurl)
                    if newweb.startswith("http") and newweb not in used_webs:
                        used_webs.add(newweb)
                        webs.append(tuple((newweb, depth + 1)))
                except Exception as e:
                    # urljoin can raise an exception if the url is malformed.
                    print(e)
                    continue
        i += 1
    return db
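A possible way to run the crawler above and persist the index it builds; pickle and the moogle.dat file name are assumptions (the latter borrowed from the answer module's default), not part of this snippet.

import pickle
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
# clean_words is assumed to be importable from the project's util module.

db = crawler("https://example.com/index.html", maxdist=2)
with open("moogle.dat", "wb") as f:
    pickle.dump(db, f)  # persist the word -> {(title, url)} index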
def get_title(self):
    return util.clean_words(self.title)