def save_to_db(self, publications):
    """Store every publication of the current author through one connection.

    A single ``Storage`` is connected, passed to
    ``self.save_publication_to_db`` once per publication, and disconnected
    afterwards.

    :param publications: iterable of publication records; each item is
        handed unchanged to ``self.save_publication_to_db``.
    """
    storage = Storage()
    storage.connect()
    try:
        self.print_unicode(
            u"Saving publications for author {0} - {1} to db...".format(
                self.author_id, self.author_name))
        # Explicit loop instead of map(): the calls are made purely for their
        # side effects, and map() is lazy on Python 3, where it would never
        # invoke the callback at all.
        for publication in publications:
            self.save_publication_to_db(publication, storage)
    finally:
        # Release the connection even when one of the saves raises.
        storage.disconnect()
def update_entry_in_db(self, publication, update_values):
    """Apply ``update_values`` to the stored row of one publication.

    The row is selected from ``ms_academic_publications`` by
    ``publication_id == publication['ID']``.

    :param publication: mapping that provides the ``'ID'`` and ``'Title'``
        keys read here.
    :param update_values: passed unchanged to the UPDATE's ``.values()``.
    """
    publication_id = publication['ID']
    publication_title = publication['Title']
    self.print_unicode(
        u"Updating publication {0} - {1} as in database...".format(
            publication_id, publication_title))

    storage = Storage()
    storage.connect()
    try:
        # The table lookup stays after connect(), as in the original order.
        # NOTE(review): presumably `meta` is populated on connect - confirm.
        publications_table = meta.tables['ms_academic_publications']
        query = (
            sqlalchemy.update(publications_table)
            .where(publications_table.c.publication_id == publication_id)
            .values(update_values)
        )
        # Single-argument call form: identical output under the Python 2
        # print statement, and still valid syntax on Python 3.
        print(query)
        storage.execute(query)
    finally:
        # Do not leave the connection open when the update fails.
        storage.disconnect()
def update_entry_in_db(self, publication, update_values):
    """Apply ``update_values`` to the stored row of one publication.

    The row is selected from ``ms_academic_publications`` by
    ``publication_id == publication['ID']``.

    :param publication: mapping that provides the ``'ID'`` and ``'Title'``
        keys read here.
    :param update_values: passed unchanged to the UPDATE's ``.values()``.
    """
    publication_id = publication['ID']
    publication_title = publication['Title']
    self.print_unicode(
        u"Updating publication {0} - {1} as in database...".format(
            publication_id, publication_title))

    storage = Storage()
    storage.connect()
    try:
        # The table lookup stays after connect(), as in the original order.
        # NOTE(review): presumably `meta` is populated on connect - confirm.
        publications_table = meta.tables['ms_academic_publications']
        query = (
            sqlalchemy.update(publications_table)
            .where(publications_table.c.publication_id == publication_id)
            .values(update_values)
        )
        # Single-argument call form: identical output under the Python 2
        # print statement, and still valid syntax on Python 3.
        print(query)
        storage.execute(query)
    finally:
        # Do not leave the connection open when the update fails.
        storage.disconnect()
def save_to_db(self, publications):
    """Store every publication of the current author through one connection.

    A single ``Storage`` is connected, passed to
    ``self.save_publication_to_db`` once per publication, and disconnected
    afterwards.

    :param publications: iterable of publication records; each item is
        handed unchanged to ``self.save_publication_to_db``.
    """
    storage = Storage()
    storage.connect()
    try:
        self.print_unicode(
            u"Saving publications for author {0} - {1} to db...".format(
                self.author_id, self.author_name))
        # Explicit loop instead of map(): the calls are made purely for their
        # side effects, and map() is lazy on Python 3, where it would never
        # invoke the callback at all.
        for publication in publications:
            self.save_publication_to_db(publication, storage)
    finally:
        # Release the connection even when one of the saves raises.
        storage.disconnect()
def __init__(self):
    """Create the storage handle and an empty ``jokes_set``."""
    self.storage = Storage()
    # Built-in set instead of `Set()` (presumably `sets.Set`, deprecated
    # since Python 2.6 and absent from Python 3). It offers the same
    # `.add()` / `in` operations and needs no import.
    self.jokes_set = set()
class JokesScraper:
    """Scrape jokes category by category and store them through ``Storage``.

    ``get_jokes`` is the entry point: it reads the categories from the
    database, walks every listing page of each category, fetches each joke
    page and inserts or updates the joke and its category link.

    NOTE(review): the trailing methods ``save_json``, ``parse``,
    ``get_category_id``, ``save_to_db``, ``save_category_to_db`` and
    ``print_unicode`` use ``self.path``, ``self.categories`` and
    ``self.create_directory``, none of which this class sets or defines in
    the visible source - confirm they belong to this class.
    """

    def __init__(self):
        """Create the storage handle and the set of joke titles seen so far."""
        self.storage = Storage()
        # Built-in set instead of `Set()` (presumably the deprecated
        # `sets.Set`); only `.add()` and `in` are used on it.
        self.jokes_set = set()

    def _fetch_tree(self, url):
        """Download ``url`` and return it parsed with ``etree.HTMLParser``."""
        req = urllib2.Request(url=url, headers={"User-Agent": constants.USER_AGENT})
        response = urllib2.urlopen(req)
        try:
            return etree.parse(response, etree.HTMLParser())
        finally:
            # The original never closed the HTTP response.
            response.close()

    def get_jokes(self):
        """Scrape and store the jokes of every category.

        :return: list with the joke link elements of all categories.
        """
        categories = self.get_categories()
        jokes = []
        for category in categories:
            jokes_for_category = self.get_jokes_by_category(category)
            self.process_jokes(jokes_for_category, category)
            jokes.extend(jokes_for_category)
            # Blank separator line (was a bare `print` statement).
            # NOTE(review): its nesting was not recoverable from the
            # flattened source; it may have been after the loop.
            print("")
        return jokes

    def get_jokes_by_category(self, category):
        """Collect the joke elements from all listing pages of ``category``.

        :param category: mapping with at least ``"name"`` and ``"url"``.
        :return: list of joke elements found across the pages.
        """
        print("Getting jokes for category {0}".format(category["name"]))
        all_jokes = []
        [jokes_on_page, older_posts] = self.get_jokes_on_page(category["url"])
        all_jokes.extend(jokes_on_page)
        # Follow the first "older posts" link until a page has none.
        while older_posts:
            url = older_posts[0].attrib["href"]
            print("Getting next page " + url)
            [jokes_on_page, older_posts] = self.get_jokes_on_page(url)
            all_jokes.extend(jokes_on_page)
        return all_jokes

    def get_jokes_on_page(self, url):
        """Return ``[jokes, older_posts]`` selected from the page at ``url``."""
        tree = self._fetch_tree(url)
        jokes = tree.xpath(constants.JOKES_SELECTOR)
        older_posts = tree.xpath(constants.OLDER_POSTS_SELECTOR)
        for element in jokes:
            print(element.text + " " + element.attrib["href"])
        return [jokes, older_posts]

    def get_categories(self):
        """Return the result of selecting id, name and url of the categories.

        NOTE(review): unlike the other queries this one never disconnects,
        and it reads ``"subcategories"`` while other code in this file uses
        ``"subcategory"`` - left unchanged, confirm both.
        """
        self.storage.connect()
        categories = meta.tables["subcategories"]
        query = sqlalchemy.select([categories.c.id, categories.c.name, categories.c.url])
        results = self.storage.execute(query)
        return results

    def process_jokes(self, jokes, category):
        """Fetch every joke page and store the ones that could be parsed."""
        for joke in jokes:
            joke_data = self.process_joke(joke.attrib["href"])
            if joke_data:
                self.save_joke_to_db(joke_data, category)

    def process_joke(self, url):
        """Parse a single joke page.

        :return: dict with ``title``, ``content``, ``rating``, ``votes``,
            ``likes``, ``url`` and ``comments_count``, or ``None`` when the
            title, content text, rating or votes are missing.
        """
        tree = self._fetch_tree(url)
        title = tree.xpath(constants.TITLE_SELECTOR)
        content = tree.xpath(constants.CONTENT_SELECTOR)
        text_extractor = HtmlTextExtractor(content)
        content_text = text_extractor.extract_text()
        rating = tree.xpath(constants.RATING_SELECTOR)
        votes = tree.xpath(constants.VOTES_SELECTOR)
        comments_count = tree.xpath(constants.COMMENTS_SELECTOR)
        # Like counting is disabled; fb_social_plugin_get_likes is a TODO.
        likes = 0
        if title and content_text and rating and votes:
            print(title[0].text)
            print(content_text)
            print(rating[0].text)
            print(votes[0].text)
            if comments_count:
                print(comments_count[0].text)
                # First word of the counter text; "No" is mapped to 0.
                count = comments_count[0].text.split(" ")[0]
                if count == "No":
                    count = 0
            else:
                count = 0
            print(count)
            return {
                "title": title[0].text,
                "content": content_text,
                "rating": rating[0].text,
                "votes": votes[0].text,
                "likes": likes,
                "url": url,
                "comments_count": count,
            }
        return None

    def fb_social_plugin_get_likes(self, tree, joke_url):
        """Read the like count from the Facebook plugin iframe (TODO, unused).

        :return: text of the plugin counter, or ``0`` when the plugin element
            or the counter is not found. The original left the result
            undefined when the plugin element was missing; ``0`` matches the
            default used by ``process_joke``.
        """
        count = 0
        likes = tree.xpath("/html//div[@data-href='" + joke_url + "']")
        print(likes)
        if likes:
            print(likes[0].attrib)
            print("Requesting to facebook " + likes[0].attrib["fb-iframe-plugin-query"])
            url = likes[0].attrib["fb-iframe-plugin-query"]
            plugin_tree = self._fetch_tree(url)
            likes_count = plugin_tree.xpath("/html//span[@class='pluginCountTextDisconnected']")
            if likes_count:
                print("Likes " + likes_count[0].text)
                count = likes_count[0].text
            print(count)
        return count

    def save_joke_to_db(self, joke_data, category):
        """Insert or refresh a joke and link it to ``category`` if needed."""
        joke_id = self.get_joke_id(joke_data["title"])
        if joke_id is None:
            print("New Joke. Inserting into db")
            self.insert_joke_in_db(joke_data)
        elif joke_data["title"] not in self.jokes_set:
            # Known joke, first time seen in this run: refresh its data.
            print("New scanning of joke")
            self.update_joke_in_db(joke_id, joke_data)
        # NOTE(review): whether this add ran for every joke or only in the
        # branch above was not recoverable from the flattened source.
        self.jokes_set.add(joke_data["title"])
        if not self.joke_in_category(joke_id, category):
            print("Joke not present in category " + category["name"])
            self.add_joke_to_category(joke_data, category)

    def get_joke_id(self, joke_title):
        """Return the id of the joke titled ``joke_title`` or ``None``."""
        print("Getting joke id")
        self.storage.connect()
        try:
            # Table lookup kept after connect(), as in the original order.
            jokes = meta.tables["joke"]
            query = sqlalchemy.select([jokes.c.id]).where(jokes.c.title == joke_title)
            results = self.storage.execute(query)
        finally:
            self.storage.disconnect()
        for result in results:
            print(result["id"])
            return result["id"]
        return None

    def insert_joke_in_db(self, data):
        """Insert ``data`` as a new row of the ``joke`` table."""
        self.storage.insert("joke", data)

    def update_joke_in_db(self, joke_id, joke_data):
        """Overwrite the row ``joke_id`` of ``joke`` with ``joke_data``."""
        self.storage.connect()
        try:
            jokes = meta.tables["joke"]
            query = sqlalchemy.update(jokes).where(jokes.c.id == joke_id).values(joke_data)
            print(query)
            self.storage.execute(query)
        finally:
            self.storage.disconnect()

    def joke_in_category(self, joke_id, category):
        """Tell whether ``joke_category`` already links the joke and category."""
        self.storage.connect()
        try:
            jokes_categories = meta.tables["joke_category"]
            query = sqlalchemy.select([jokes_categories.c.id]).where(
                and_(jokes_categories.c.joke_id == joke_id,
                     jokes_categories.c.subcategory_id == category["id"])
            )
            results = self.storage.execute(query)
        finally:
            self.storage.disconnect()
        for result in results:
            print(result["id"])
            return True
        return False

    def add_joke_to_category(self, joke_data, category):
        """Link the joke (looked up by title) to ``category``.

        NOTE(review): inserts into ``"jokes_categories"`` while
        ``joke_in_category`` reads ``"joke_category"`` - left unchanged,
        confirm which table name is right.
        """
        joke_id = self.get_joke_id(joke_data["title"])
        insert_data = {"joke_id": joke_id, "subcategory_id": category["id"]}
        self.storage.insert("jokes_categories", insert_data)

    def get_comments(self, url):
        """Print the raw discussion page of a joke (exploratory, unused)."""
        # TODO find out how comments are loaded
        req = urllib2.Request(url="{0}/#discuss_thread".format(url),
                              headers={"User-Agent": constants.USER_AGENT})
        response = urllib2.urlopen(req)
        try:
            print(response.read())
        finally:
            response.close()
        # Unfinished parsing sketch kept from the original:
        # html_parser = etree.HTMLParser()
        # tree = etree.parse(response, html_parser)
        #
        # posts = tree.xpath(constants.COMMENTS_SELECTOR)
        # print posts
        # for post in posts:
        #     author = post.xpath("//span[@class='author]")[0]
        #     timestamp = post.xpath("//a[@class='time-ago']")[0]
        #     content = post.xpath("//div[@class='post-message']/p[1]")[0]
        #
        #     print author.text
        #     print timestamp.text
        #     print content.text

    def save_json(self, json_resp):
        """Dump ``json_resp`` to ``<self.path>/json/response.json``."""
        self.create_directory()
        print("Saving json to disk...")
        with open(u"{0}/json/response.json".format(self.path), "w") as f:
            json.dump(json_resp, f)

    def parse(self, json_resp):
        """Return ``json_resp["d"]["Author"]["Result"]``."""
        authors = json_resp["d"]["Author"]["Result"]
        return authors

    def get_category_id(self, storage, name):
        """Return the id of the category called ``name``.

        Falls through, returning ``None``, when no row matches.
        """
        storage.connect()
        try:
            categories = meta.tables["category"]
            query = sqlalchemy.select([categories.c.id]).where(categories.c.name == name)
            results = storage.execute(query)
        finally:
            storage.disconnect()
        for result in results:
            print(result["id"])
            return result["id"]

    def save_to_db(self):
        """Store every entry of ``self.categories`` through one ``Storage``."""
        storage = Storage()
        print("Saving categories to db...")
        # Explicit loop instead of map(): the calls are made for their side
        # effects, and map() is lazy on Python 3.
        for category in self.categories:
            self.save_category_to_db(category, storage)

    def save_category_to_db(self, category, storage):
        """Insert ``category`` and then its subcategories.

        :param category: mapping with ``"name"``, ``"url"`` and
            ``"subcategories"`` (each with ``"name"`` and ``"url"``).
        :param storage: ``Storage`` used for all statements.
        """
        storage.connect()
        try:
            categories_table = meta.tables["category"]
            insert_data = {"name": category["name"], "url": category["url"]}
            query = sqlalchemy.insert(categories_table, insert_data)
            print(query)
            storage.execute(query)
        finally:
            storage.disconnect()

        # The new id is read back by name to use as the foreign key below.
        category_id = self.get_category_id(storage, category["name"])

        storage.connect()
        try:
            subcategories_table = meta.tables["subcategory"]
            for subcategory in category["subcategories"]:
                query = sqlalchemy.insert(
                    subcategories_table,
                    {"name": subcategory["name"], "url": subcategory["url"], "category_id": category_id},
                )
                print(query)
                storage.execute(query)
        finally:
            storage.disconnect()

    def print_unicode(self, string):
        """Print ``string``; on an encode failure print the codec name."""
        try:
            print(string)
        except UnicodeEncodeError as e:
            print(u"Unicode Error: {0}".format(e.encoding))
class JokesXmlCreator:
    """Export the jokes stored in the database as JSON and XML files."""

    def __init__(self, out_dir):
        """Remember the output directory and create the storage handle.

        :param out_dir: directory the ``joke-<idx>`` files are written to.
        """
        self.out_dir = out_dir
        self.storage = Storage()

    @staticmethod
    def _to_utf8(text):
        """Return ``text`` as UTF-8 bytes, leaving byte strings untouched."""
        if isinstance(text, bytes):
            return text
        return text.encode("utf-8")

    def save_jokes(self):
        """Write one JSON and one XML file per joke with non-blank content.

        The index used in the file names counts only the jokes that pass
        the ``isspace()`` filter.
        """
        jokes = self.load_jokes_from_db()
        # Materialise the filtered list first, as the original did, because
        # save_joke() reconnects the shared storage for every joke.
        selected = [joke for joke in jokes if not joke['content'].isspace()]
        for idx, joke in enumerate(selected):
            self.save_joke(idx, joke)

    def load_jokes_from_db(self):
        """Return the result of selecting all rows of the ``joke`` table."""
        self.storage.connect()
        try:
            # Table lookup kept after connect(), as in the original order.
            jokes = meta.tables['joke']
            query = sqlalchemy.select([jokes.c.id, jokes.c.title, jokes.c.content,
                                       jokes.c.rating, jokes.c.votes, jokes.c.url,
                                       jokes.c.comments_count])
            results = self.storage.execute(query)
        finally:
            self.storage.disconnect()
        return results

    def save_joke(self, idx, joke):
        """Attach the joke's categories and write it to disk.

        :param idx: number used in the output file names.
        :param joke: joke row; copied into a dict before it is extended.
        """
        print("Saving joke " + joke["title"])
        self.storage.connect()
        try:
            joke_data = dict(joke.items())
            joke_data['categories'] = []
            jokes_categories = meta.tables['joke_category']
            subcategories = meta.tables['subcategory']
            categories = meta.tables['category']
            # Join joke -> joke_category -> subcategory -> category.
            query = sqlalchemy.select([subcategories.c.id.label("subcategory_id"),
                                       categories.c.id.label("category_id"),
                                       subcategories.c.name.label("subcategory_name"),
                                       categories.c.name.label("category_name")]).where(
                and_(
                    joke["id"] == jokes_categories.c.joke_id,
                    jokes_categories.c.subcategory_id == subcategories.c.id,
                    subcategories.c.category_id == categories.c.id
                )
            )
            print(query)
            results = self.storage.execute(query)
        finally:
            self.storage.disconnect()
        for result in results:
            print(result)
            joke_data['categories'].append({'category_id': str(result['category_id']),
                                            'subcategory_id': str(result['subcategory_id']),
                                            'category_name': result['category_name'],
                                            'subcategory_name': result['subcategory_name']})
        print(joke_data)
        self.save_joke_on_disk(idx, joke_data)

    def save_joke_on_disk(self, idx, joke):
        """Write ``joke`` as ``joke-<idx>.json`` and ``joke-<idx>.xml``.

        ``joke['votes']``, ``joke['id']`` and ``joke['comments_count']`` are
        replaced by their ``str()`` form in the passed dict before the XML
        is produced.
        """
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)
        with open("{0}/joke-{1}.json".format(self.out_dir, idx), "w") as f:
            data = json.dumps(joke, encoding='utf-8', ensure_ascii=False)
            # Python 2 json.dumps with ensure_ascii=False may return either
            # str or unicode; encoding an already-encoded non-ASCII str
            # raised UnicodeDecodeError, so only text is encoded.
            f.write(self._to_utf8(data))
        with open("{0}/joke-{1}.xml".format(self.out_dir, idx), "w") as f:
            # NOTE(review): enables dicttoxml debug mode on every call;
            # kept as in the original - confirm it is still wanted.
            dicttoxml.set_debug()
            joke['votes'] = str(joke['votes'])
            joke['id'] = str(joke['id'])
            joke['comments_count'] = str(joke['comments_count'])
            xml = dicttoxml.dicttoxml(joke)
            # Depending on the dicttoxml release the result may already be
            # a UTF-8 byte string; encode only when it is still text.
            f.write(self._to_utf8(xml))
def __init__(self, out_dir):
    """Keep the output directory and create the storage handle.

    :param out_dir: stored unchanged as ``self.out_dir``.
    """
    self.out_dir, self.storage = out_dir, Storage()
def save_to_db(self, authors):
    """Store every author through one shared connection.

    A single ``Storage`` is connected, passed to ``self.save_author_to_db``
    once per author, and disconnected afterwards.

    :param authors: iterable of author records; each item is handed
        unchanged to ``self.save_author_to_db``.
    """
    storage = Storage()
    storage.connect()
    try:
        # Single-argument call form: identical output under the Python 2
        # print statement, and still valid syntax on Python 3.
        print("Saving authors to db...")
        # Explicit loop instead of map(): the calls are made purely for their
        # side effects, and map() is lazy on Python 3, where it would never
        # invoke the callback at all.
        for author in authors:
            self.save_author_to_db(author, storage)
    finally:
        # Release the connection even when one of the saves raises.
        storage.disconnect()