#!/usr/bin/env python
# chat_scraper.py
import logging
import logging.handlers
import os
import sys
import time

import chatexchange.client
import chatexchange.events

from database.Elastic import ElasticManager
from tools.Logger import get_logger

logger = get_logger("chat_scraper")


def main():
    host_id = 'stackexchange.com'
    room_id = '151'  # Charcoal Chatbot Sandbox

    # Read credentials from the environment, falling back to a prompt.
    if 'ChatExchangeU' in os.environ:
        email = os.environ['ChatExchangeU']
    else:
        email = raw_input("Email: ")
    if 'ChatExchangeP' in os.environ:
        password = os.environ['ChatExchangeP']
    else:
        password = raw_input("Password: ")

    client = chatexchange.client.Client(host_id)
    client.login(email, password)

    room = client.get_room(room_id)
    room.join()
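    # --- Sketch, not the author's code: the original file is truncated at
    # room.join(). Given the ElasticManager and chatexchange.events imports,
    # a watcher along these lines would plausibly follow. Room.watch and
    # MessagePosted are real ChatExchange APIs; the indexed dict layout is
    # an assumption mirroring database/Elastic.py.
    def on_message(event, client):
        # Ignore edits, stars, joins, etc.; index only new messages.
        if not isinstance(event, chatexchange.events.MessagePosted):
            return
        ElasticManager.index_message({
            "id": event.message.id,  # assumed document schema
            "content": event.content,
        })

    room.watch(on_message)

    # Keep the main thread alive while the watcher thread runs.
    while True:
        time.sleep(60)


if __name__ == '__main__':
    main()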
# database/Elastic.py
__author__ = 'lucas'

from elasticsearch import Elasticsearch

from config import ELASTIC_CLUSTER
from tools.Logger import get_logger

logger = get_logger("Elastic")


class ESSessionManager(object):
    # One shared Elasticsearch connection for all managers.
    es_session = Elasticsearch(ELASTIC_CLUSTER)

    def __init__(self):
        pass


class ElasticManager(object):
    @staticmethod
    def index_messages(message_list):
        es = ESSessionManager().es_session
        for message in message_list:
            try:
                logger.debug(es.index(index="secse", doc_type="monologue",
                                      id=message["id"], body=message))
            except Exception as e:
                logger.exception(e)

    @staticmethod
    def index_message(message):
        es = ESSessionManager().es_session
        try:
            # Same call as index_messages above, for a single message.
            logger.debug(es.index(index="secse", doc_type="monologue",
                                  id=message["id"], body=message))
        except Exception as e:
            logger.exception(e)
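
# Usage sketch (not in the original file): messages are plain dicts whose
# "id" becomes the Elasticsearch document id. The payload below is a
# hypothetical example, not real scraped data.
if __name__ == '__main__':
    ElasticManager.index_message({
        "id": 12345,  # hypothetical message id
        "user": "lucas",  # hypothetical author
        "content": "hello from the sandbox",
    })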
# scrape_dmz.py
__author__ = 'lucas'

import os

import requests

from database.Elastic import ElasticManager
from scraper.Transcript import TranscriptScraper
from time import sleep
from tools.Logger import get_logger

logger = get_logger("scrape_dmz")

scraper = TranscriptScraper(151)

# keep a set of all URLs we still need to fetch and process,
# seeded with the first day of the room's transcript
process_list = set()
process_list.add(scraper.get_first_day())
# keep a list of URLs which have already been processed so we do not
# fetch the same page twice
processed_list = list()

# identify ourselves to SE so they know whom to contact if we cause load
headers = {
    'User-Agent': 'ChatExchangeScraper - contact Lucas Kauffman',
}

x = 0
try:
    # First pass: index transcript pages already mirrored to local disk.
    for root, dirs, files in os.walk("/home/lucas/dmz"):
        for file in files:
            if file.endswith(".html"):
                with open(os.path.join(root, file)) as FILE:
                    response = FILE.read()
                # a monologue can contain several messages
                monologues = scraper.extract_monologues(response)
                messages = scraper.extract_messages_from_monologues(monologues)
                ElasticManager.index_messages(messages)
                x += 1  # count of files processed so far
except Exception as e:
    logger.exception(e)
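
# --- Sketch, not the author's code: the script above only replays pages
# mirrored under /home/lucas/dmz, yet it declares process_list,
# processed_list, and headers and imports requests and sleep. A live crawl
# of the transcript would plausibly look like this; how new day-URLs are
# discovered is elided, since that part of the scraper is not visible.
while process_list:
    url = process_list.pop()
    if url in processed_list:
        continue
    response = requests.get(url, headers=headers).content
    monologues = scraper.extract_monologues(response)
    ElasticManager.index_messages(
        scraper.extract_messages_from_monologues(monologues))
    processed_list.append(url)
    # a hypothetical next-day link extractor would add to process_list here
    sleep(1)  # throttle requests to SE; the interval is an assumption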
# scraper/Transcript.py
__author__ = 'lucas'

import logging

import requests
from bs4 import BeautifulSoup

from config import BASE_URL, TRANSCRIPT
from tools.Logger import get_logger

logger = get_logger("Transcript")


class TranscriptScraper(object):
    def __init__(self, room_id):
        self.room_id = room_id

    def get_first_day(self):
        # The room's transcript landing page links to its first day.
        response = requests.get(BASE_URL + TRANSCRIPT + str(self.room_id))
        soup = TranscriptScraper.get_bs(response.content)
        main_div = soup.find("div", {"id": "main"})
        first_day_href = main_div.find('a')["href"]
        return BASE_URL + first_day_href

    @staticmethod
    def get_bs(content):
        # Prefer the more forgiving html5lib parser, falling back to the
        # stdlib parser when html5lib is not installed.
        try:
            return BeautifulSoup(content, "html5lib")
        except Exception:
            return BeautifulSoup(content, "html.parser")

    def extract_messages_from_monologues(self, monologues_soups):
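        # --- Sketch, not the author's code: the method body is truncated in
        # the original. SE transcript pages wrap each message in div.message
        # with an id of the form "message-<id>"; the fields kept per message
        # are an assumed schema.
        messages = []
        for monologue in monologues_soups:
            for message_div in monologue.find_all("div", {"class": "message"}):
                content_div = message_div.find("div", {"class": "content"})
                if content_div is None:
                    continue
                messages.append({
                    "id": int(message_div["id"].replace("message-", "")),
                    "content": content_div.get_text().strip(),
                })
        return messages

    def extract_monologues(self, content):
        # Sketch: called from scrape_dmz.py but not visible in the truncated
        # source. SE transcripts group consecutive messages by one speaker
        # in div.monologue blocks.
        soup = TranscriptScraper.get_bs(content)
        return soup.find_all("div", {"class": "monologue"})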