示例#1
0
class Producer:
    """Publish messages to a Kafka topic and record each outcome in the database."""

    def __init__(self, config):
        """Derive the Kafka properties and create the database connector.

        Args:
            config: Mapping with at least a ``'kafka'`` and an ``'airflow'``
                section; the whole mapping is also handed to
                ``DatabaseConnector``.
        """
        self._properties = self._get_properties(config)
        self._airflow = config['airflow']
        self._db = DatabaseConnector(config)

    def produce_to_kafka(self, data):
        """Send ``data`` to the configured topic.

        The delivery result is reported through ``_on_success`` /
        ``_on_failure``, which write a row via the database connector.

        Args:
            data: Message value passed unchanged to ``KafkaProducer.send``
                (no value serializer is configured).
        """
        # The topic is a routing target, not a KafkaProducer setting, so it is
        # split off before the remaining entries are used as constructor kwargs.
        producer_settings = dict(self._properties['producer'])
        topic = producer_settings.pop('topic')
        producer = KafkaProducer(**producer_settings)
        # todo: add "locally" logging -> "Sending data to kafka.."
        try:
            future = producer.send(topic=topic, value=data)
            future.add_callback(self._on_success, data=data)
            future.add_errback(self._on_failure, data=data)
        finally:
            # A new producer is created on every call; closing it releases the
            # client instead of leaking one per message.
            # NOTE(review): close() is expected to wait for pending sends --
            # confirm against the installed kafka client version.
            producer.close()

    def run(self):
        """Placeholder entry point; currently does nothing."""
        pass

    def _on_success(self, metadata, data=None) -> None:
        """Record a successful delivery of ``data`` as an INFO entry."""
        # todo: add "locally" logging -> f"Successfully sent message: {metadata}"
        self._db.log_to_db('INFO', data, '')

    def _on_failure(self, exception, data=None) -> None:
        """Record a failed delivery of ``data`` as an ERROR entry with the exception text."""
        # todo: add "locally" logging -> f"Error while attempting to send message to kafka: {ex}"
        self._db.log_to_db('ERROR', data, str(exception))

    @staticmethod
    def _get_properties(config):
        """Translate the ``'kafka'`` section of ``config`` into a properties dict.

        Args:
            config: Mapping whose ``'kafka'`` section provides ``topic``,
                ``host`` and ``protocol``, plus the store locations and
                passwords when the protocol is SSL.

        Returns:
            A dict with a ``'producer'`` entry and, when the protocol is SSL
            (case-insensitive), additional ``'common_client'`` and ``'ssl'``
            entries.

        Raises:
            KeyError: If a required configuration key is missing.
        """
        kafka = config['kafka']
        properties = {
            'producer': {
                # Key must be 'topic': that is the name produce_to_kafka reads.
                'topic': kafka['topic'],
                'bootstrap_servers': kafka['host'],
                'key_serializer': None,
                'value_serializer': None,
                'acks': 'all',
            },
        }
        if kafka['protocol'].lower() == 'ssl':
            # Plain string; a trailing comma here would store a 1-tuple.
            properties['common_client'] = 'SSL'
            properties['ssl'] = {
                'ssl_endpoint_identification_algorithm': '',
                'ssl_truststore_location': kafka['truststoreLocation'],
                'ssl_keystore_location': kafka['keystoreLocation'],
                'ssl_truststore_password': kafka['truststorePassword'],
                'ssl_keystore_password': kafka['keystorePassword'],
                'ssl_key_password': kafka['keyPassword'],
            }

        return properties
示例#2
0
import requests, time, os, math, threading
from datetime import datetime, timedelta, timezone, time as dt_time
from dateutil import parser
from dotenv import load_dotenv
from team import Team
from image import build_image
from twitter import TwitterClient
from sms import SMSClient
from db import DatabaseConnector
from prediction import Prediction
from utils import headers

# Module-level service clients, created once at import time.
# NOTE(review): these are constructed before load_dotenv() runs below; if any
# constructor reads environment variables, values from the .env file would not
# be loaded yet -- confirm whether the order is intentional.
twitter_client = TwitterClient()
sms_client = SMSClient()
db_connector = DatabaseConnector()

# Load environment variables, then read the league identifiers from them.
load_dotenv()
# Directory containing this module.
dirname = os.path.dirname(__file__)
# Both ids are None when the corresponding variable is not set.
LEAGUE_ID = os.getenv("LEAGUE_ID")
LEAGUE_ID_PREV = os.getenv("LEAGUE_ID_PREV")


def get_utc_timestamp():
    """Return the current Unix time in whole seconds.

    Returns:
        int: Seconds since the epoch, independent of the host's local timezone.
    """
    # An aware datetime is required: calling .timestamp() on the naive value
    # returned by utcnow() interprets it as local time, which skews the result
    # by the local UTC offset on any machine not running in UTC.
    return int(datetime.now(timezone.utc).timestamp())


def current_utc_day():
    """Return midnight of the current UTC date as a naive datetime."""
    utc_now = datetime.utcnow()
    # Building from the date parts leaves hour/minute/second/microsecond at 0.
    return datetime(utc_now.year, utc_now.month, utc_now.day)


def seconds_to(time):
示例#3
0
 def __init__(self, person, init_page):
     """Store the search person, fetch the initial page and open the database.

     Args:
         person: Value stored as ``self.person``.
         init_page: Passed to ``self.request_init_page``; the result is kept
             as ``self.init_soup``.
     """
     self.person = person
     fetched_soup = self.request_init_page(init_page)
     self.init_soup = fetched_soup
     connector = DatabaseConnector(Config.DATABASE)
     self.db = connector
示例#4
0
class PortalScraper():
    """Find one person's comments on the portal and store them in the database."""

    def __init__(self, person, init_page):
        """Fetch the initial page and open the database connection.

        Args:
            person: Commenter name to search for (compared by exact equality).
            init_page: URL of the page listing the latest comments.
        """
        self.person = person
        self.init_soup = self.request_init_page(init_page)
        self.db = DatabaseConnector(Config.DATABASE)

    def request_init_page(self, init_page):
        """Download ``init_page`` and return it parsed as a BeautifulSoup object."""
        # Request the page and parse the response text with the bs4 HTML parser.
        res = requests.get(init_page)
        return BeautifulSoup(res.text, 'html.parser')

    def get_person_article_links(self):
        """
            Reads through the init page (http://portal.fo/seinastu+vidmerkingarnar.html),
            finds every comment by the search person and collects the links to
            the articles in which those comments were given.
            Returns: Set of links if the page has changed, None if nothing has changed.
        """
        comment_items = self.init_soup.find_all("div", class_="comment_item")
        hash_checker = HashChecker()
        hash_checker.generate_hash(comment_items)

        # Compares with the previously saved hash.
        if not hash_checker.hash_compare():
            return None

        hash_checker.save_new_hash()
        search_person_article_links = set()
        for comment in comment_items:
            name_tag = comment.find(class_="comment_profilename")
            # A comment item without a profile name cannot match; skip it
            # instead of failing on ``None.text``.
            if name_tag is not None and name_tag.text == self.person:
                search_person_article_links.add(comment.parent.get('href'))
        return search_person_article_links

    def scrape_articles(self, articles):
        """Scrape the comment section of every article and store matching comments.

        Args:
            articles: Iterable of article URLs. ``None`` (what
                ``get_person_article_links`` returns when nothing changed) is
                treated as "nothing to do".
        """
        for article in articles or ():
            comment_section_soup = self.get_comment_section(article)
            if comment_section_soup is None:
                # The comment section could not be loaded; the reason has
                # already been printed by get_comment_section.
                continue
            self.extract_comment_data(comment_section_soup, article)

    def get_comment_section(self, article):
        """
            -- This method is only meant to be used in this file --
            The Facebook Comments Plugin is loaded with Javascript, so we can't use the
            request module to read the articles, because it only gets static server HTML.
            This method uses Selenium, so we can wait for the plugin to have been loaded.
            Returns: Soup of the article's comment section (BeautifulSoup object),
            or None if the page timed out or the plugin iframe was not found.
        """
        # NOTE(review): the driver is never quit here; whether Config.get_driver()
        # returns a shared instance is not visible from this class -- confirm.
        driver = Config.get_driver()
        driver.get(article)
        timeout = 10
        try:
            # Wait until the Facebook widget container is present in the page.
            element_present = EC.presence_of_element_located(
                (By.CLASS_NAME, 'fb_iframe_widget'))
            WebDriverWait(driver, timeout).until(element_present)

            # Wait for fb_iframe_widget_loader to disappear.
            self.wait_until_disappeared(driver, 'fb_iframe_widget_loader')

            # The plugin has been loaded: parse the page body so the iframe
            # can be located.
            page_html = driver.execute_script(
                "return document.body.innerHTML")
            soup_comments = BeautifulSoup(page_html, 'html.parser')

            # This is the Facebook comments plugin, which is an iframe.
            facebook_plugin_iframe = soup_comments.find('iframe',
                                                        class_="fb_ltr")
            if facebook_plugin_iframe is None:
                print("Facebook comments iframe not found")
                return None
            frame_id = facebook_plugin_iframe.get('id')

            # Reset to the top-level document, then switch into the plugin
            # iframe so its content can be expanded and read.
            driver.switch_to.default_content()
            driver.switch_to.frame(frame_id)
            self.press_load_more_comments_if_present(driver)
            self.press_open_replies_if_present(driver)
            iframe_innerhtml = driver.execute_script(
                "return document.body.innerHTML")
            return BeautifulSoup(iframe_innerhtml, 'html.parser')

        except TimeoutException:
            print("Timed out waiting for page to load")
            return None

    def wait_until_disappeared(self, driver, element):
        """Wait up to 10 seconds for elements with class name ``element`` to become invisible."""
        timeout = 10
        try:
            WebDriverWait(driver, timeout).until(
                EC.invisibility_of_element_located((By.CLASS_NAME, element)))
        except TimeoutException:
            print("Timed out waiting for element to disappear")

    def press_load_more_comments_if_present(self, driver):
        """Click every element whose text contains 'more comments'."""
        load_more_buttons = driver.find_elements(
            By.XPATH, "//*[contains(text(), 'more comments')]")
        for load_button in load_more_buttons:
            # Scroll the button into view before clicking it.
            driver.execute_script("arguments[0].scrollIntoView();",
                                  load_button)
            load_button.click()

    def press_open_replies_if_present(self, driver):
        """
            -- This method is only meant to be used in this file --
            Clicks every 'more replies/reply in this thread' link and then waits
            for the loading indicators to disappear.
        """
        span_show_more_replies = driver.find_elements(
            By.XPATH,
            "//*[contains(text(), 'more replies in this thread') or contains(text(), 'more reply in this thread')]"
        )
        for span_tag in span_show_more_replies:
            # Navigate one level up to the anchor tag
            anchor_clickable = span_tag.find_element(By.XPATH, '..')
            driver.execute_script("arguments[0].scrollIntoView();",
                                  anchor_clickable)
            anchor_clickable.click()

        # Wait until all loading spans are gone.
        # The presence of them means that the plugin is loading the comments
        timeout = 10
        try:
            WebDriverWait(driver, timeout).until(
                EC.invisibility_of_element_located(
                    (By.XPATH, "//span[@aria-valuetext='Loading...']")))
        except TimeoutException:
            print("Timed out waiting for element to disappear")

    def extract_comment_data(self, comment_section_soup, article):
        """Store every comment written by the search person in the database.

        Args:
            comment_section_soup: Parsed comment section of one article.
            article: Article URL saved alongside each comment.
        """
        comment_divs = comment_section_soup.find_all(
            class_='UFICommentActorName')
        for comment_div in comment_divs:
            # Get commenter name and compare it with the person we are searching for
            commenter_name = comment_div.text
            if commenter_name != self.person:
                continue
            print('This is ', self.person)

            # The name sits inside a parent span; the comment text is in that
            # span's next sibling.
            parent_span = comment_div.parent
            comment_sibling_div = parent_span.find_next_sibling()
            comment_text = comment_sibling_div.text

            # The sibling after the comment holds the likes and the timestamp.
            like_time_sibling_div = comment_sibling_div.find_next_sibling()

            # A child containing an <i> tag means there are likes. Only tag
            # children are inspected: on a text node ``.find`` is ``str.find``,
            # whose -1 "not found" result is truthy and would be mistaken for
            # a match.
            likes = ''
            for child in like_time_sibling_div.find_all(True, recursive=False):
                if child.find('i') is not None:
                    likes = child.text

            comment_utime = like_time_sibling_div.find(
                "abbr", {
                    "class": "UFISutroCommentTimestamp"
                }).get('data-utime')
            comment_timestamp = self.utime_to_timespamp(comment_utime)

            person_dict = {
                'name': commenter_name,
                'text': comment_text,
                'article': article,
                'likes': likes,
                'comment_timestamp': comment_timestamp,
            }
            self.db.insert_comment(person_dict)

    def utime_to_timespamp(self, utime):
        """Format a Unix time (int or numeric string) as 'YYYY-MM-DD HH:MM:SS' in local time."""
        return datetime.datetime.fromtimestamp(
            int(utime)).strftime('%Y-%m-%d %H:%M:%S')

    def __repr__(self):
        """Identify the scraper by the person it searches for."""
        return f"Search person: {self.person}"
示例#5
0
 def __init__(self, config):
     """Read the Airflow setting, build the Kafka properties and open the database.

     Args:
         config: Mapping with an ``'airflow'`` entry; it is also passed to
             ``self._get_properties`` and ``DatabaseConnector``.
     """
     airflow_setting = config['airflow']
     kafka_properties = self._get_properties(config)
     self._airflow, self._properties = airflow_setting, kafka_properties
     self._db = DatabaseConnector(config)
示例#6
0
class Consumer:
    """Consume Kafka records and trigger an Airflow DAG run for each one."""

    # Upper bound for the Airflow HTTP call, so a stalled connection cannot
    # block the consume loop forever.
    _HTTP_TIMEOUT_SECONDS = 30

    def __init__(self, config):
        """Read the Airflow base URL, build the Kafka properties and open the database.

        Args:
            config: Mapping with at least a ``'kafka'`` and an ``'airflow'``
                section; the whole mapping is also handed to
                ``DatabaseConnector``.
        """
        self._airflow = config['airflow']
        self._properties = self._get_properties(config)
        self._db = DatabaseConnector(config)

    def run(self) -> None:
        """Start consuming; runs for as long as the consumer yields records."""
        self._consume_from_kafka()

    def _consume_from_kafka(self) -> None:
        """Log every incoming record to the database and forward it to ``http_post``."""
        # todo: handle ssl parameters
        # Topic names are positional arguments of KafkaConsumer, not keyword
        # settings, so they are split off from the remaining configuration.
        consumer_settings = dict(self._properties['consumer'])
        topics = consumer_settings.pop('topics')
        if isinstance(topics, (str, bytes)):
            topics = [topics]
        consumer = KafkaConsumer(*topics, **consumer_settings)
        # todo: logging consumer initialization

        for record in consumer:
            print(f'Got next message: {record}')
            # todo: logging "locally record.value input"
            self._db.log_to_db('INFO', record.value, '')
            self.http_post(record.value)

    def http_post(self, data: str) -> None:
        """Trigger the Airflow DAG configured for the message type of ``data``.

        The outcome is written to the database as an INFO or ERROR entry.

        Args:
            data: JSON document containing a ``'message_type'`` key.
        """
        # todo: logging "locally" -> f'Triggering DAG with the following data: {data}'
        dag_name = self._db.get_config_from_db(self.get_message_type(data=data))
        airflow_url = f'{self._airflow}/api/experimental/dags/{dag_name}/dag_runs'

        try:
            response = requests.post(
                url=airflow_url,
                # The request declares a JSON body, so send a valid (empty)
                # JSON object rather than no body at all.
                data=json.dumps({}),
                headers={
                    'Content-Type': 'application/json',
                    'Cache-Control': 'no-cache'
                },
                timeout=self._HTTP_TIMEOUT_SECONDS)
        except requests.RequestException as error:
            # A network failure is recorded like any other failed trigger
            # instead of terminating the consume loop.
            self._db.log_to_db(
                'ERROR', data, f'Error while triggering Airflow DAG: {error}')
            return

        if response.ok:
            # todo: logging "locally" -> "Triggered Airflow DAG successfully"
            self._db.log_to_db('INFO', data, '')
        else:
            # todo: logging "locally" -> "Error while triggering Airflow DAG"
            self._db.log_to_db('ERROR', data, 'Error while triggering Airflow DAG')

    # todo: check
    @staticmethod
    def get_message_type(data: str) -> str:
        """Return the ``'message_type'`` field of the JSON document ``data``.

        Raises:
            ValueError: If ``data`` is not valid JSON.
            KeyError: If the document has no ``'message_type'`` key.
        """
        message_type = json.loads(data)['message_type']
        # todo: add "locally" logging -> f"Got message_type from kafka: {message_type}"
        return message_type

    @staticmethod
    def _get_properties(config):
        """Translate the ``'kafka'`` section of ``config`` into a properties dict.

        Args:
            config: Mapping whose ``'kafka'`` section provides ``topic``,
                ``host``, ``groupid`` and ``protocol``, plus the store
                locations and passwords when the protocol is SSL.

        Returns:
            A dict with a ``'consumer'`` entry and, when the protocol is SSL
            (case-insensitive), additional ``'common_client'`` and ``'ssl'``
            entries.

        Raises:
            KeyError: If a required configuration key is missing.
        """
        kafka = config['kafka']
        properties = {
            'consumer': {
                'topics': kafka['topic'],
                'bootstrap_servers': kafka['host'],
                'group_id': kafka['groupid'],
                'key_deserializer': None,
                'value_deserializer': None,
                'enable_auto_commit': True,
                'auto_offset_reset': 'latest',
            },
        }
        if kafka['protocol'].lower() == 'ssl':
            # Plain string; a trailing comma here would store a 1-tuple.
            properties['common_client'] = 'SSL'
            properties['ssl'] = {
                'ssl_endpoint_identification_algorithm': '',
                'ssl_truststore_location': kafka['truststoreLocation'],
                'ssl_keystore_location': kafka['keystoreLocation'],
                'ssl_truststore_password': kafka['truststorePassword'],
                'ssl_keystore_password': kafka['keystorePassword'],
                'ssl_key_password': kafka['keyPassword'],
            }

        return properties
示例#7
0
        },
        "password": {
            "type": "string",
        },
        "spotifyUsername": {
            "type": "string",
        },
       
    },
    "required": ["email", "password"],
    "additionalProperties": False
}


# Create instance of DatabaseConnector
databaseConnection = DatabaseConnector.DatabaseConnector()

# Create the Flask application and tell it where to look to serve HTML files
# (templates and static assets both live under react-frontend/).
application = Flask(__name__, template_folder='react-frontend/templates', static_folder='react-frontend/static')

# Prepare the mongo instance: the connection URI comes from the connector.
application.config["MONGO_URI"] = databaseConnection.getURI()
# Access-token lifetime setting: one day.
application.config['JWT_ACCESS_TOKEN_EXPIRES'] = datetime.timedelta(days=1)
application.config['PROPAGATE_EXCEPTIONS'] = True
# NOTE(review): the secret key is hard-coded, short, and committed to source;
# it should presumably be loaded from the environment or a secret store --
# confirm what depends on this value before rotating it.
application.config['SECRET_KEY'] = "'\xe9\xa5'"

# Create the Mongo, bcrypt and JWT extension objects with our Flask application
mongo = PyMongo(application)
flask_bcrypt = Bcrypt(application)
jwt = JWTManager(application)