Exemplo n.º 1
0
    def run(self):
        """Index the corpus in batches, then merge the partial catalogs.

        Steps, in order:

        1. Fetch the document list with ``get_docs(CORPUS_PATH)`` and walk it
           in the chunks produced by ``self.__chunk``. For each chunk the
           contents of all its files are concatenated, parsed, and passed to
           ``self.__run_batch`` with a 1-based batch number and a ``Mapper``
           shared by every batch.
        2. Write the mapper (under ``self.name``) and the document lengths.
        3. Merge the catalog pairs returned by ``self.chunk_catalog()`` until
           at most one catalog is left in ``self.catalogs``.
        4. Persist what remains with ``Catalog.write``.

        Progress messages go to stdout. The single-argument ``print(...)``
        form is used so the output is identical under Python 2 while the
        method also parses under Python 3.
        """
        docs = get_docs(CORPUS_PATH)
        mapper = Mapper()

        for batch_no, files in enumerate(self.__chunk(docs), 1):
            contents = []
            for path in files:
                with open(path, 'r') as handle:
                    contents.append(handle.read())

            # The whole chunk is parsed as one concatenated string.
            self.__run_batch(parse(''.join(contents)), batch_no, mapper)

        print("Writing the mapper to file -------------------------------------")
        mapper.write(self.name)
        print("Writing DocLengths to file --------------------------------------")
        self.__writeDocLengths()

        # ``> 1`` instead of ``!= 1``: with zero catalogs there is nothing to
        # merge, and the old condition would presumably never become false.
        # NOTE(review): termination relies on __merge shrinking
        # self.catalogs, which is not visible from here -- confirm.
        while len(self.catalogs) > 1:
            # Evaluate the pairing once per pass and reuse it for both the
            # debug print and the iteration (assumes chunk_catalog() has no
            # side effects).
            pairs = self.chunk_catalog()
            print(pairs)
            for pair in pairs:
                print(pair)
                if len(pair) != 2:
                    # An incomplete pair cannot be merged; abandon this pass
                    # and re-pair on the next trip around the while loop.
                    break
                self.__merge(self.catalogs[pair[0]],
                             self.catalogs[pair[1]],
                             pair)

        print("Writing the catalog to file for later use -----------------------")
        Catalog.write(self.catalogs, self.name)
 def create_links_map(self, links_index, links_type):
     """Map every link stored in an index and write the mapping to file.

     Runs a ``match_all`` scroll search (10000 hits per page, ``10m``
     keep-alive) against ``links_index`` / ``links_type`` through
     ``self.es``. For each hit, ``_source['SRC_LINK']`` and then
     ``_source['DST_LINK']`` are passed to ``Mapper.map``. When the scroll
     is exhausted the mapper is written to ``MAPPINGS_PATH``.

     A running count of scrolled hits is printed after every non-empty
     page. The scroll context is cleared when the method finishes, whether
     it succeeds or raises.

     :param links_index: name of the index holding the link documents.
     :param links_type: document type to query within that index.
     """
     mapper = Mapper()
     page = self.es.search(index=links_index,
                           doc_type=links_type,
                           scroll='10m',
                           size=10000,
                           body={"query": {
                               "match_all": {}
                           }})
     scroll_id = page['_scroll_id']
     scrolled = 0
     try:
         while True:
             # Always continue from the id returned by the latest response.
             scroll_id = page['_scroll_id']
             hits = page['hits']['hits']
             # An empty page marks the end of the scroll. Testing the page
             # itself (not hits.total) avoids a wasted extra request and
             # does not depend on the shape of the ``total`` field.
             if not hits:
                 break
             for hit in hits:
                 source = hit['_source']
                 mapper.map(source['SRC_LINK'])
                 mapper.map(source['DST_LINK'])
             scrolled += len(hits)
             print("scrolled %s \n" % scrolled)
             page = self.es.scroll(scroll_id=scroll_id, scroll='10m')
     finally:
         # Release the server-side scroll context now instead of leaving it
         # open until the 10m keep-alive expires.
         self.es.clear_scroll(scroll_id=scroll_id)
     mapper.write(MAPPINGS_PATH)