Example #1
 def __init__(self, url, keywords, requested_by):
     self.requested_by = requested_by
     url = url + quote(keywords)
     super().__init__(url,
                      keywords,
                      '',
                      callback=self.parse_search_result,
                      requested_by=requested_by)
     self.lang = "fr"
     self.dbf = DBFace()
Example #2
class DiplomatScrapper(StaticScrapper):
    def __init__(self, url, keywords, requested_by=None):
        self.requested_by = requested_by
        url_args = {
            'key': "AIzaSyCVAXiUzRYsML1Pv6RwSG1gunmMikTzQqY",
            'rsz': "filtered_cse",
            'num': 10,
            'hl': "en",
            'prettyPrint': "false",
            'source': "gcsc",
            'gss': ".com",
            'sig': "d5630e36052d1355ead71530c29be9ea",
            'cx': "006972344228181832854:w07k6emi2wk",
            'cse_tok': "ABPF6HibCVLLP6-x8toeGUn5PJY3CrbCXw:1526812940946",
            "q": keywords
        }
        super().__init__(url,
                         keywords,
                         url_args,
                         callback=self.parse_search_result,
                         requested_by=requested_by)
        self.lang = "en"
        self.dbf = DBFace()

    def parse_search_result(self, url, page_content, keywords):
        print("The Diplomat: got {} chars".format(len(page_content)))
        result = json.loads(page_content)
        if 'error' in result.keys():
            print("TheDiplomatScrapper: " +
                  result['error']['errors'][0]['message'])
            print("obsolete cse_tok parameter ?")
        else:
            for i in result['results']:
                lnk = i['clicktrackUrl']
                query_part = urlparse(lnk).query
                query_comps = parse_qs(query_part)
                lnk = query_comps['q'][0]
                sc = StaticScrapper(lnk,
                                    keywords=keywords,
                                    callback=self.parse_page_content,
                                    requested_by=self.requested_by)
                sc.start()

    def parse_page_content(self, url, page_content, keywords):
        out_text = []
        soup = BeautifulSoup(page_content, "lxml")
        content_p = soup.find_all('div', {'itemprop': ['articleBody']})
        # print("Diplomat: found {} article elem".format(len(content_p)))
        for maincnt in content_p:
            for parag in maincnt.find_all('p'):
                pt = parag.get_text()
                out_text.append(pt)
        # print("read {} chars on {}".format(len(''.join(out_text)), url))
        self.dbf.add_record(keywords, url, ''.join(out_text), lang=self.lang)
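A minimal usage sketch of the class above, mirroring how Congruence.run_scrappers drives it in Example #13. The collector callback and the join loop are illustrative assumptions; the URL and constructor signature come from this listing.

# Hedged usage sketch; the collector callback is an assumption.
threads = []

def collector(thread):
    # DiplomatScrapper (and the StaticScrappers it spawns) pass themselves back
    # through requested_by, so the caller can join() them later.
    threads.append(thread)

dps = DiplomatScrapper('https://www.googleapis.com/customsearch/v1element?',
                       'kim jong',
                       requested_by=collector)
dps.start()   # Thread subclass: fetches the custom-search page and feeds parse_search_result
for t in threads:
    t.join()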
Example #3
    def __init__(self, keywords, url=None, requested_by=None, run=True):

        self.keywords = keywords
        self.url = url
        self.requested_by = requested_by

        if run:
            Thread.__init__(self)

        self.dbf = DBFace()
        if requested_by is not None and callable(requested_by):
            requested_by(self)
Example #4
class GraphBolt(Bolt):
    outputs = ['info', 'graph_json']

    def initialize(self, conf, ctx):
        self.pid = os.getpid()
        self.db = DBFace()

    def process(self, tup):
        info = tup.values[0]
        wordcounts = tup.values[1]

        graph = GlobalGraph(wordcounts, logger=self.logger)
        graph_json = graph.to_json()

        self.db.insert_graph(graph_json)
        self.logger.info("graph inserted in db")

        self.emit([info, graph_json])
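GraphBolt follows the streamparse Bolt interface: initialize runs once per worker (storing the pid and opening the DB handle), while process takes an (info, wordcounts) tuple, builds a GlobalGraph, persists its JSON form, and re-emits it downstream. A sketch of how such a bolt might be wired into a topology, assuming streamparse and hypothetical upstream components:

# Hypothetical topology wiring (streamparse style); the spout, the upstream wordcount
# bolt, and all module paths are assumptions made for illustration only.
from streamparse import Topology

from spouts.keywords import KeywordSpout    # hypothetical spout emitting (info, keywords)
from bolts.wordcount import WordcountBolt   # hypothetical bolt emitting (info, wordcounts)
from bolts.graph import GraphBolt           # the bolt shown above


class CongruenceTopology(Topology):
    keyword_spout = KeywordSpout.spec()
    wordcount_bolt = WordcountBolt.spec(inputs=[keyword_spout])
    graph_bolt = GraphBolt.spec(inputs=[wordcount_bolt])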
Example #5
 def __init__(self, url, keywords, requested_by=None):
     self.requested_by = requested_by
     url_args = {
         'key': "AIzaSyCVAXiUzRYsML1Pv6RwSG1gunmMikTzQqY",
         'rsz': "filtered_cse",
         'num': 10,
         'hl': "en",
         'prettyPrint': "false",
         'source': "gcsc",
         'gss': ".com",
         'sig': "d5630e36052d1355ead71530c29be9ea",
         'cx': "006972344228181832854:w07k6emi2wk",
         'cse_tok': "ABPF6HibCVLLP6-x8toeGUn5PJY3CrbCXw:1526812940946",
         "q": keywords
     }
     super().__init__(url,
                      keywords,
                      url_args,
                      callback=self.parse_search_result,
                      requested_by=requested_by)
     self.lang = "en"
     self.dbf = DBFace()
Example #6
 def __init__(self,
              url,
              keywords=None,
              url_args=None,
              callback=None,
              requested_by=None):
     Thread.__init__(self)
     # assert callable(callback) is True or callback is None
     self.url = url
     self.request_url = ''
     self.callback = callback
     self.url_args = url_args
     self.keywords = keywords
     self.dbf = DBFace()
     self.requested_by = requested_by
     if requested_by is not None and callable(requested_by):
         requested_by(self)
Example #7
 def __init__(self,
              url,
              keywords=None,
              url_args=None,
              callback=None,
              js=True,
              requested_by=None):
     Thread.__init__(self)
     self.requested_by = requested_by
     assert callback is None or callable(callback)
     self.url = url
     self.request_url = ''
     self.callback = callback
     self.url_args = url_args
     self.keywords = keywords
     self.js = js  # if true, use the selenium framework to get content generated from js
     self.dbf = DBFace()
     if requested_by is not None and callable(requested_by):
         requested_by(self)
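Compared with Example #6, this constructor adds a js flag: when it is true the scraper is meant to fetch pages through Selenium so that JavaScript-generated content is present. The fetch itself is not shown in this listing; a minimal sketch of what it could look like, assuming a headless Firefox webdriver (the function name fetch_url_js is hypothetical):

# Hypothetical sketch of a Selenium-backed fetch, assuming a headless Firefox webdriver;
# fetch_url_js is not part of the project.
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

def fetch_url_js(url):
    options = Options()
    options.headless = True           # no visible browser window
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)               # lets the page's JavaScript run
        return driver.page_source     # DOM serialized after script execution
    finally:
        driver.quit()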
Example #8
class FigaroStaticScrapper(StaticScrapper):
    def __init__(self, url, keywords, requested_by):
        self.requested_by = requested_by
        url = url + quote(keywords)
        super().__init__(url,
                         keywords,
                         '',
                         callback=self.parse_search_result,
                         requested_by=requested_by)
        self.lang = "fr"
        self.dbf = DBFace()

    def parse_search_result(self, url, page_content, keywords):
        # print("figaro received {}".format(len(page_content)))
        soup = BeautifulSoup(page_content, "lxml")
        resdivs = soup.find_all('section', {'class': ['fig-profil',
                                                      'fig-profil-mtpd',
                                                      'fig-profil-std',
                                                      'univers-figaro-vox']})
        print("found {} results on figaro".format(len(resdivs)))
        for i in resdivs:
            lnk = i.find_all('a')[0].get('href')
            sc = StaticScrapper(lnk,
                                keywords=keywords,
                                callback=self.parse_page_content,
                                requested_by=self.requested_by)
            sc.start()

    def parse_page_content(self, url, page_content, keywords):
        out_text = []
        soup = BeautifulSoup(page_content, "lxml")
        content_p = soup.find_all('div', {'class': 'fig-content__body'})
        if len(content_p) == 0:
            # sports pages
            content_p = soup.find_all('div', {'class': 's24-art-body'})
        if len(content_p) == 0:
            # 'le particulier' pages
            content_p = soup.find_all('div', {'class': ['wysiwyg', 'classic']})
        if len(content_p) == 0:
            # 'vin' (wine) pages
            content_p = soup.find_all('div', {'id': 'content-text'})
        if len(content_p) == 0:
            # 'figaro madame' pages
            content_p = soup.find_all('div', {'class': ['article-body',
                                                        'mad__article__content__body',
                                                        'selectionShareable']})
        if len(content_p) == 0:
            # 'economie' pages
            content_p = soup.find_all(
                'div', {'class': 'texte'})  # does not work because len > 0

        if len(content_p) == 0:
            # "l'etudiant" pages
            content_p = soup.find_all('div', {'class': 'article__content'})

        for maincnt in content_p:
            for parag in maincnt.find_all('p'):
                # print(parag.get_text())
                out_text.append(parag.get_text())
        # print("read {} chars on {}".format(len(''.join(out_text)), url))
        self.dbf.add_record(keywords, url, ''.join(out_text), lang=self.lang)
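The chain of if len(content_p) == 0 checks above tries one Le Figaro section layout after another until a selector matches. The same fallback order can also be expressed as data, which makes adding a new section a one-line change; a behaviour-equivalent sketch, not taken from the project:

# Behaviour-equivalent sketch (not from the project): the fallback chain as data.
FIGARO_BODY_SELECTORS = [
    {'class': 'fig-content__body'},          # standard articles
    {'class': 's24-art-body'},               # sports pages
    {'class': ['wysiwyg', 'classic']},       # 'le particulier' pages
    {'id': 'content-text'},                  # 'vin' (wine) pages
    {'class': ['article-body', 'mad__article__content__body',
               'selectionShareable']},       # 'figaro madame' pages
    {'class': 'texte'},                      # 'economie' pages
    {'class': 'article__content'},           # "l'etudiant" pages
]

def find_article_body(soup):
    # returns the first non-empty match, in the same order as the if-chain above
    for attrs in FIGARO_BODY_SELECTORS:
        content_p = soup.find_all('div', attrs)
        if content_p:
            return content_p
    return []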
Example #9
 def initialize(self, conf, ctx):
     self.pid = os.getpid()
     self.db = DBFace()
Example #10
import sys

import conf                          # project configuration module (import path assumed)
from Congruence import Congruence    # import path assumed; the class is shown in Example #13
from utils.DBFace import DBFace
import utils.Wordcount_methods as wcm
import utils.Graph as graph

if __name__ == '__main__':
    print("running on Python version {}".format(sys.version))
    keywords = "kim jong"

    # if keywords are supplied on the command line, use them instead
    if len(sys.argv) > 1:
        keywords = ' '.join(sys.argv[1:])

    congruence = Congruence()
    congruence.recursive_search(keywords,
                                keywords,
                                conf.RECURSIVE_DEPTH,
                                langs=['en'])

    dbf = DBFace()

    wordcounts = dbf.get_wordcounts(keywords)
    global_wordcount = wcm.aggregate_wordcount_dicts(wordcounts)

    # print(global_wordcount)

    g = graph.GlobalGraph(wordcounts, n=6)
    #print(g.to_json())
    g.to_dot()
Example #11
class StaticScrapper(Thread):
    def __init__(self, keywords, url=None, requested_by=None, run=True):

        self.keywords = keywords
        self.url = url
        self.requested_by = requested_by

        if run:
            Thread.__init__(self)

        self.dbf = DBFace()
        if requested_by is not None and callable(requested_by):
            requested_by(self)

    def search(self):
        #url_args = self.search_args
        search_params = self.make_search_params(self.keywords)
        page_content = self.fetch_url(search_params[0], search_params[1])
        links = self.parse_search_page(page_content)
        print("found links : {}".format(links))

        for lnk in links:
            sc = self.__class__(keywords=self.keywords,
                                url=lnk,
                                requested_by=self.requested_by)
            sc.start()

    def content_to_db(self):
        page_content = self.fetch_url(self.url)
        paragraphs = self.parse_page_content(page_content)

        # print("read {} chars on {}".format(len(''.join(out_text)), url))
        self.dbf.add_record(self.keywords,
                            self.url,
                            paragraphs,
                            lang=self.lang)

    @classmethod
    def get_search_results(cls, keywords):
        search_params = cls.make_search_params(keywords)
        search_page = cls.fetch_url(search_params[0], search_params[1])
        search_results = cls.parse_search_page(search_page)
        return search_results

    @classmethod
    def get_scrap_results(cls, url):
        article_raw = cls.fetch_url(url)
        article_content = cls.parse_page_content(article_raw)
        return article_content

    @staticmethod
    def fetch_url(url, url_args=None):
        try:
            encoded_args = urlencode(url_args)
            request_url = url + encoded_args
        except TypeError:
            # url_args is None: request the bare URL without a query string
            request_url = url
        try:
            print("requesting url : ", request_url)
            r = requests.get(request_url)
            return r.text
        except requests.exceptions.ConnectionError as ce:
            print(ce)
        except urllib3.exceptions.MaxRetryError as mre:
            print(mre)

    def run(self):
        if self.url is None:
            self.search()
        else:
            self.content_to_db()
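In this rewritten StaticScrapper, run() either fans out over search results when no url is given, or scrapes a single article into the database when one is. A subclass is therefore expected to provide make_search_params, parse_search_page, parse_page_content and a lang attribute, none of which appear in this listing. A minimal subclass sketch under those assumptions; the site, URL and CSS classes are placeholders:

# Hypothetical subclass sketch; the site, URL and CSS classes below are placeholders.
from bs4 import BeautifulSoup


class ExampleScrapper(StaticScrapper):
    lang = 'en'   # content_to_db() reads self.lang when storing the record

    @staticmethod
    def make_search_params(keywords):
        # (base_url, url_args) pair, consumed by fetch_url(url, url_args)
        return 'https://news.example.com/search?', {'q': keywords}

    @staticmethod
    def parse_search_page(page_content):
        # return the list of article links found on the search page
        soup = BeautifulSoup(page_content, 'lxml')
        return [a.get('href') for a in soup.find_all('a', {'class': 'result-link'})]

    @staticmethod
    def parse_page_content(page_content):
        # return the article text that add_record() will store
        soup = BeautifulSoup(page_content, 'lxml')
        return ''.join(p.get_text() for p in soup.find_all('p'))


# No url given: run() calls search(), which spawns one ExampleScrapper per result link.
ExampleScrapper('kim jong').start()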
Example #12
 def __init__(self):
     self.threads = []
     self.keywords = ''
     self.dbf = DBFace()
Example #13
class Congruence:
    def __init__(self):
        self.threads = []
        self.keywords = ''
        self.dbf = DBFace()

    def run(self, keywords):

        print("running Congruence with keyword {}".format(keywords))
        self.keywords = keywords
        self.recursive_search(self.keywords, self.keywords, 1, langs=['en'])
        wordcounts = self.dbf.get_wordcounts(self.keywords)
        self.g = graph.GlobalGraph(wordcounts, n=6)
        lgg = self.g.to_json()
        self.dbf.insert_graph(lgg)
        g = self.dbf.get_graph()
        # jstr = json.dumps(g)
        # return str(g)
        return g

    def get_db(self):
        return self.dbf

    def thread_accumulator(self, thread):
        # print("started thread {}".format(thread))
        self.threads.append(thread)

    def run_scrappers(self, keywords, langs):

        if 'fr' in langs:
            if conf.SCRAPPERS_VERSION == 1:
                ns = NouvelobsStaticScrapper(
                    "https://recherche.nouvelobs.com/?", keywords,
                    self.thread_accumulator)
                ls = LiberationStaticScrapper(
                    "http://www.liberation.fr/recherche/?", keywords,
                    self.thread_accumulator)
                fs = FigaroStaticScrapper(
                    "http://recherche.lefigaro.fr/recherche/", keywords,
                    self.thread_accumulator)
            else:
                ns = NouvelobsStaticScrapper(
                    keywords, requested_by=self.thread_accumulator)
                ls = LiberationStaticScrapper(
                    keywords, requested_by=self.thread_accumulator)
                fs = FigaroStaticScrapper(keywords,
                                          requested_by=self.thread_accumulator)

            ls.start()
            ns.start()
            fs.start()

        if 'en' in langs:
            if conf.SCRAPPERS_VERSION == 1:
                nys = NYTScrapper("https://www.nytimes.com/search/", keywords,
                                  self.thread_accumulator)
                bbs = BBCScrapper("https://www.bbc.co.uk/search?", keywords,
                                  self.thread_accumulator)
                # cnn = CNNScrapper("https://edition.cnn.com/search/?", keywords, self.thread_accumulator)
                dps = DiplomatScrapper(
                    'https://www.googleapis.com/customsearch/v1element?',
                    keywords, self.thread_accumulator)
            else:
                nys = NYTScrapper(keywords,
                                  requested_by=self.thread_accumulator)
                bbs = BBCScrapper(keywords,
                                  requested_by=self.thread_accumulator)
                # cnn = CNNScrapper(keywords, self.thread_accumulator)
                dps = DiplomatScrapper(keywords,
                                       requested_by=self.thread_accumulator)
                # tis = TheInterceptScrapper(keywords, requested_by=self.thread_accumulator)

            nys.start()
            bbs.start()
            # cnn.start()
            dps.start()
            #tis.start()

        for t in self.threads:
            t.join()

    def recursive_search(self,
                         initial_keywords,
                         current_keywords,
                         depth,
                         langs=['en']):
        if depth == 0:
            return None

        analyser = Analyser(conf.NLP_HOST, conf.NLP_PORT)

        print(
            "running recursive search at depth {} for keyword {} from initial keyword {}"
            .format(depth, current_keywords, initial_keywords))
        self.run_scrappers(current_keywords, langs=['en'])

        fwst = self.dbf.find_with_search_term(current_keywords)
        print("found {} document{} originating from keyword {}".format(
            len(fwst), '' if len(fwst) <= 1 else 's', current_keywords))

        fwc = self.dbf.find_with_content(self.keywords, exact=True)
        print("found {} document{} containing text {}".format(
            len(fwc), '' if len(fwc) <= 1 else 's', current_keywords))

        notk = self.dbf.find_tokenifiable(langs=["en"])
        nowc = self.dbf.find_wordcountable(langs=["en"])
        print("found {} tokenifiable doc{}".format(
            notk.count(), '' if notk.count() == 1 else 's'))
        print("found {} wordcountable doc{}".format(
            nowc.count(), '' if nowc.count() == 1 else 's'))
        self.dbf.batch_tokenify(notk, analyser)

        wordcounts = self.dbf.get_wordcounts(current_keywords)

        global_wordcount = wcm.aggregate_wordcount_dicts(wordcounts)
        # print(global_wordcount)

        global_wordcount_dict_best = {
            k: wcm.take_firsts(v, n=3)
            for k, v in global_wordcount.items()
            if k in ["PERSON", "ORGANIZATION"]
        }
        global_wordcount_best = wcm.aggregate_subjects(
            global_wordcount_dict_best)
        for token in global_wordcount_best:
            self.recursive_search(initial_keywords, token[0][0], depth - 1,
                                  langs)

if __name__ == '__main__':
    print("running on Python version {}".format(sys.version))