def insert_article(self, art):
    exists = self.session.query(Article).filter_by(url=art['url']).first()
    if not exists:
        article = Article(url=art['url'], title=art['title'],
                          date=art['date'])
        self.session.add(article)
        logger.write_log("added: " + art['title'] + " to database")
        self.session.commit()
def insert_location(self, location):
    exists = self.session.query(Location).filter_by(
        name=location['name']).first()
    if not exists:
        loc = Location(name=location['name'])
        self.session.add(loc)
        logger.write_log("added: " + location['name'] + " to database")
        self.session.commit()
def insert_organisation(self, organisation):
    exists = self.session.query(Organisation).filter_by(
        name=organisation['name']).first()
    if not exists:
        org = Organisation(name=organisation['name'])
        self.session.add(org)
        logger.write_log("added: " + organisation['name'] + " to database")
        self.session.commit()
def insert_person(self, person):
    exists = self.session.query(Person).filter_by(
        name=person['name']).first()
    if not exists:
        pers = Person(name=person['name'])
        self.session.add(pers)
        logger.write_log("added: " + person['name'] + " to database")
        self.session.commit()
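# Usage sketch for the insert_* helpers above. Hedged: DBSession is the
# wrapper class these methods live on (as used by bp_index below); the entity
# dict is hypothetical. Each helper checks filter_by(...).first() before
# adding, so repeating an insert with the same name is a no-op:
def _demo_idempotent_insert():
    db = DBSession()
    db.insert_person({'name': 'Maria Popova'})
    db.insert_person({'name': 'Maria Popova'})  # already exists, nothing added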
def get_internal_links(self):
    logger.write_log("visiting: " + self.url)
    links = self.get_links()
    internal_links = []
    for link in links:
        if "brainpickings.org/20" in link and link not in internal_links:
            internal_links.append(link)
    return internal_links
def insert_reference(self, ref):
    # Generate a unique id for the db row
    sha_id = hashlib.sha1(bytes(ref['url'] + ref['ref'], 'utf-8'))
    ref['id'] = sha_id.hexdigest()
    exists = self.session.query(Reference).filter_by(id=ref['id']).first()
    if not exists:
        reference = Reference(id=ref['id'], url=ref['url'], ref=ref['ref'])
        self.session.add(reference)
        logger.write_log("added reference from: " + ref['url'] +
                         " to: " + ref['ref'])
        self.session.commit()
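# Why sha1 here: hashing url + ref yields a deterministic primary key, so the
# same (url, ref) pair always maps to the same row and a re-crawl cannot
# insert duplicates. A minimal standalone check (values are hypothetical):
import hashlib

def _demo_reference_id():
    a = hashlib.sha1(bytes("https://example.org/post" +
                           "https://example.org/book", 'utf-8')).hexdigest()
    b = hashlib.sha1(bytes("https://example.org/post" +
                           "https://example.org/book", 'utf-8')).hexdigest()
    assert a == b  # same inputs, same id -> filter_by(id=...) finds the repeat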
def get_people(self):
    logger.write_log("getting people from: " + self.url)
    persons = self.ner.get_persons()
    main = 0
    if persons:
        for i, pers in enumerate(persons):
            pers['am_i_main'] = False
            if pers['count'] > persons[main]['count']:
                main = i
        persons[main]['am_i_main'] = True
    return persons
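# Worked example for get_people's "main person" pass (input shape inferred
# from the code: the NER layer returns dicts with at least 'name' and
# 'count'). The most-mentioned person is flagged as the article's subject:
#
#   [{'name': 'Thoreau', 'count': 2}, {'name': 'Emerson', 'count': 5}]
# becomes
#   [{'name': 'Thoreau', 'count': 2, 'am_i_main': False},
#    {'name': 'Emerson', 'count': 5, 'am_i_main': True}]
#
# Ties keep the earliest entry, since only a strictly greater count moves main.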
def insert_location_rel(self, rel):
    # Generate a unique id for the db row
    sha_id = hashlib.sha1(bytes(rel['article'] + rel['location'], 'utf-8'))
    rel['id'] = sha_id.hexdigest()
    exists = self.session.query(LocationRel).filter_by(
        id=rel['id']).first()
    if not exists:
        relation = LocationRel(id=rel['id'], article=rel['article'],
                               location=rel['location'], count=rel['count'])
        self.session.add(relation)
        logger.write_log("added relation from: " + rel['article'] +
                         " to: " + rel['location'])
        self.session.commit()
def optimize_my_database():
    logger.write_log("Optimize one-offs")
    delete_one_offs()
    logger.write_log("Optimize too-longs")
    delete_five_or_more()
    logger.write_log("Optimize duplication")
    fix_name_duplication()
    fix_case_duplication()
    logger.write_log("Optimize not-letters")
    fix_symbols()
    # logger.write_log("Fix link-errors")
    # link_errors()
    logger.write_log("Verify")
    verify_all_with_dbpedia()
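# A minimal sketch of what one cleanup pass above might look like. Hedged:
# the real delete_one_offs lives elsewhere in this repo; the choice of
# LocationRel and the count == 1 criterion are assumptions based on the
# models used in this file, not the actual implementation.
def _sketch_delete_one_offs(session):
    # Drop relations whose mention count is 1; single hits are mostly NER
    # noise. One commit covers the whole pass.
    for rel in session.query(LocationRel).filter(LocationRel.count == 1).all():
        session.delete(rel)
    session.commit()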
def bp_index(self):
    """
    Main index method, iterates over all dates within a range
    :return: a list of dicts { 'url': url, 'title': title, 'date': date object }
    """
    logger.write_log("Indexing from: " + str(self.start_date) +
                     " to: " + str(self.end_date))
    db = DBSession()
    articlelist = []
    delta = dt.timedelta(days=1)
    while self.start_date <= self.end_date:
        page = self.fetch_page(self.start_date.year,
                               self.start_date.month,
                               self.start_date.day)
        if page != "empty":
            for article in page:
                articlelist.append(article)
                db.insert_article(article)
        self.start_date += delta
    return articlelist
def fetch_page(self, y, m, d):
    """
    Constructs a URL and checks whether it contains a page.
    :param y: year
    :param m: month
    :param d: day
    :return: a list of dicts { 'url': url, 'title': title, 'date': date object }
    """
    url = self.baseurl + "/" + dts(y) + "/" + dts(m) + "/" + dts(d) + "/"
    response = read(url, self.local)
    if response == "empty":
        return response
    logger.write_log("visiting: " + url)
    title = url.replace("https://", "")
    title = title.replace("/", ":")
    # Save the article locally
    if self.save and not self.local:
        save_html(response, "html_collection_pages/" + title)
    articles = self.fetch_articles(response)
    for article in articles:
        article['date'] = dt.date(y, m, d)
    return articles
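# Usage sketch for the indexing loop above. Hedged: the indexer class name
# and constructor arguments are assumptions; only bp_index and fetch_page are
# from this file. bp_index walks one day at a time and stores whatever
# fetch_page finds for that date:
#
#   indexer = Indexer(start_date=dt.date(2016, 1, 1),
#                     end_date=dt.date(2016, 1, 31))
#   articles = indexer.bp_index()  # also inserted into the db as it goes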
def read_url(url):
    try:
        response = urlopen(url)
    except urllib.error.URLError as e:
        # HTTPError is a subclass of URLError, so one handler covers both
        logger.write_log(url + " : " + str(e))
        return "empty"
    # Make sure that the page is HTML before decoding
    if response.getheader('Content-Type') == "text/html; charset=UTF-8":
        html_bytes = response.read()
        html_string = html_bytes.decode("utf-8")
        logger.write_log(url + " : " + "Added to index")
    else:
        print(url + " not crawlable")
        logger.write_log(url + " : " + "Page not html/utf-8")
        return "empty"
    return html_string
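# Usage sketch for read_url (the URL is hypothetical). The sentinel string
# "empty" doubles as the error value for HTTP failures and non-HTML pages, so
# callers must compare against it before treating the result as HTML:
def _demo_read_url():
    html = read_url("https://www.brainpickings.org/2016/01/01/")
    if html != "empty":
        save_html(html, "html_collection_pages/demo")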
def save_html(html_string, location):
    location += ".html"
    with open(location, "w") as file:
        file.write(html_string)
    logger.write_log(location + " written to file")
def get_organisations(self):
    logger.write_log("getting organisations from: " + self.url)
    return self.ner.get_organisations()
def get_locations(self):
    logger.write_log("getting locations from: " + self.url)
    return self.ner.get_locations()