Пример #1
0
#!/usr/bin/env python
import logging
import logging.handlers
import os
import sys
import chatexchange.client
import chatexchange.events
from database.Elastic import ElasticManager
import time
from tools.Logger import get_logger
logger = get_logger("chat_scraper")


def main():
    """Log in to Stack Exchange chat and join room 151.

    The email and password are taken from the ``ChatExchangeU`` and
    ``ChatExchangeP`` environment variables. Whichever of the two is not
    set is asked for interactively; the password prompt does not echo
    what is typed.
    """
    # Function-scope import so this block brings its own dependency.
    import getpass

    host_id = 'stackexchange.com'
    room_id = '151'  # Charcoal Chatbot Sandbox

    # raw_input only exists on Python 2; on Python 3 input() is the
    # equivalent (it returns the typed line without evaluating it).
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    if 'ChatExchangeU' in os.environ:
        email = os.environ['ChatExchangeU']
    else:
        email = read_line("Email: ")
    if 'ChatExchangeP' in os.environ:
        password = os.environ['ChatExchangeP']
    else:
        # getpass keeps the password from being echoed to the terminal.
        password = getpass.getpass("Password: ")

    client = chatexchange.client.Client(host_id)
    client.login(email, password)

    room = client.get_room(room_id)
    room.join()
Пример #2
0
__author__ = 'lucas'

from elasticsearch import Elasticsearch
from config import ELASTIC_CLUSTER
from tools.Logger import get_logger

logger = get_logger("Elastic")

class ESSessionManager(object):
    """Holder for the Elasticsearch client used by this module."""

    # Built once, when the class body is executed, from ELASTIC_CLUSTER.
    # Instances read it through the class attribute, so they all see the
    # same client object.
    es_session = Elasticsearch(ELASTIC_CLUSTER)

    def __init__(self):
        """Nothing is set up per instance."""


class ElasticManager(object):

    @staticmethod
    def index_messages(message_list):
        """Index every message of ``message_list`` into Elasticsearch.

        Each message is written to the ``secse`` index with doc type
        ``monologue``, using ``message["id"]`` as the document id and the
        message itself as the body. The response of each index call is
        logged at debug level.

        A failure on one message is logged (with traceback) and does not
        stop the remaining messages from being indexed.

        :param message_list: iterable of messages; each one must support
            ``message["id"]``.
        """
        es = ESSessionManager().es_session
        for message in message_list:
            try:
                logger.debug(es.index(index="secse", doc_type="monologue", id=message["id"], body=message))
            except Exception as e:
                # "except X as e" works on Python 2.6+ and Python 3; the
                # old "except X, e" form is a syntax error on Python 3.
                logger.exception(e)

    @staticmethod
    def index_message(message):
        es = ESSessionManager().es_session
        try:
__author__ = 'lucas'
from scraper.Transcript import TranscriptScraper
import requests
from database.Elastic import ElasticManager
from tools.Logger import get_logger
from time import sleep
import os

logger = get_logger("scrape_dmz")
scraper = TranscriptScraper(151)

# URLs that still have to be fetched and processed, seeded with the URL
# returned by scraper.get_first_day().
process_list = set()
process_list.add(scraper.get_first_day())
# URLs that were already processed, so the same page is not fetched twice.
processed_list = []
# Custom User-Agent so Stack Exchange can tell who to contact if this
# scraper causes load.
headers = {'User-Agent': 'ChatExchangeScraper - contact Lucas Kauffman'}


# NOTE(review): x is only initialised here; its use is outside this excerpt.
x = 0

try:
		for root, dirs, files in os.walk("/home/lucas/dmz"):
			for file in files:
				if file.endswith(".html"):
					 with open(os.path.join(root, file)) as FILE:
						 response = FILE.read()
						 #a monologue can contain several messages
						 monologues = scraper.extract_monologues(response)
__author__ = 'lucas'
from bs4 import BeautifulSoup
import requests,logging
from config import BASE_URL,TRANSCRIPT
from tools.Logger import get_logger


logger = get_logger("Transcript")


class TranscriptScraper(object):

    def __init__(self,room_id):
        self.room_id = room_id

    def get_first_day(self):
        """Return the absolute URL of the first link on the transcript page.

        Fetches ``BASE_URL + TRANSCRIPT + room_id``, parses the response
        body and takes the ``href`` of the first ``<a>`` found inside the
        element ``<div id="main">``. That ``href`` is returned with
        ``BASE_URL`` prepended.
        """
        transcript_url = BASE_URL + TRANSCRIPT + str(self.room_id)
        page = requests.get(transcript_url)
        parsed = TranscriptScraper.get_bs(page.content)
        container = parsed.find("div", {"id": "main"})
        first_link = container.find('a')
        return BASE_URL + first_link["href"]
    
    @staticmethod		
    def get_bs(content):
        """Parse ``content`` into a BeautifulSoup tree.

        The ``html5lib`` parser is tried first. If building the tree with
        it fails (for example because html5lib is not available), the
        ``html.parser`` parser is used instead.

        :param content: markup to parse.
        :return: the resulting ``BeautifulSoup`` object.
        """
        try:
            return BeautifulSoup(content, "html5lib")
        except Exception:
            # Best-effort fallback kept on purpose, but narrowed from a
            # bare "except:" so KeyboardInterrupt and SystemExit are no
            # longer swallowed.
            return BeautifulSoup(content, "html.parser")


    def extract_messages_from_monologues(self, monologues_soups):