Example #1
def execute_page_rank(url_set,
                      graph_index,
                      graph_type,
                      out_file,
                      reverse_map=False):
    es_util = ElasticSearchUtility()
    web_graph = es_util.get_web_graph(graph_index, graph_type)
    page_rank_dict = page_rank(url_set, web_graph)

    # clear memory
    web_graph = None

    sorted_tuples = sorted(page_rank_dict.items(),
                           key=lambda x: x[1],
                           reverse=True)[:OUTPUT_SIZE]
    # clear memory
    page_rank_dict = None

    if reverse_map:
        print 'getting reverse url map...'
        url_reverse_map = Mapper.fromFile(MAPPING_FILE_NAME,
                                          reverse=True).mappings

        decoded_tuples = []
        for t in sorted_tuples:
            decoded_url = url_reverse_map[t[0]]  # decode url
            score = t[1]  # score passes through unchanged
            decoded_tuple = (decoded_url, score)
            decoded_tuples.append(decoded_tuple)
    else:
        decoded_tuples = sorted_tuples

    print 'writing pagerank results...'
    write(out_file, decoded_tuples)
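Both this example and the next rely on a write helper that the excerpt never shows. A minimal sketch of what it might look like, assuming one tab-separated url/score pair per line (the format is a guess, not taken from the original):

def write(out_file, scored_tuples):
    # Hypothetical reconstruction of the unshown write() helper:
    # one "url<TAB>score" pair per line, in the order given.
    with open(out_file, 'w') as f:
        for url, score in scored_tuples:
            f.write('%s\t%s\n' % (url, score))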
Example #2
def execute_hits(crawl_index_name, crawl_index_type, graph_index, graph_type):
    es_util = ElasticSearchUtility()
    web_graph = es_util.get_web_graph(graph_index, graph_type)
    link_map = Mapper.fromFile(MAPPING_FILE_NAME).mappings
    hubs, authorities = hits(crawl_index_name, crawl_index_type, web_graph,
                             QUERY_STRING, link_map)

    # clear memory
    web_graph = None
    link_map = None

    print 'sorting hubs...'
    sorted_hubs = sorted(hubs.items(), key=lambda x: x[1],
                         reverse=True)[:OUTPUT_SIZE]

    # clear memory
    hubs = None

    print 'sorting authorities...'
    sorted_auth = sorted(authorities.items(), key=lambda x: x[1],
                         reverse=True)[:OUTPUT_SIZE]

    # clear memory
    authorities = None

    print 'getting reverse url map...'
    url_reverse_map = Mapper.fromFile(MAPPING_FILE_NAME, reverse=True).mappings

    sorted_hubs_decoded = []
    for t in sorted_hubs:
        decoded_url = url_reverse_map[t[0]]  # decode url
        score = t[1]  # score passes through unchanged
        decoded_tuple = (decoded_url, score)
        sorted_hubs_decoded.append(decoded_tuple)

    sorted_auth_decoded = []
    for t in sorted_auth:
        decoded_url = url_reverse_map[t[0]]  # decode url
        score = t[1]  # score passes through unchanged
        decoded_tuple = (decoded_url, score)
        sorted_auth_decoded.append(decoded_tuple)

    print 'writing hubs...'
    write(HUBS_PATH, sorted_hubs_decoded)
    print 'writing authorities...'
    write(AUTH_PATH, sorted_auth_decoded)
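The same three-line decode loop appears once in Example #1 and twice here. A sketch of a shared helper that would replace all three (decode_scores is a name introduced for illustration, not part of the original code):

def decode_scores(scored_tuples, url_reverse_map):
    # Map each (encoded_id, score) pair back to (url, score);
    # scores pass through unchanged.
    return [(url_reverse_map[encoded_id], score)
            for encoded_id, score in scored_tuples]

With it, sorted_hubs_decoded and sorted_auth_decoded each collapse to a single call.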
Example #3
def create_encoded_graph():
    es_util = ElasticSearchUtility()
    # mapper = Mapper()

    es_util.create_index(WEB_GRAPH_INDEX, CREATE_WEB_GRAPH)
    # es_util.create_index(ENCODED_LINKS_INDEX, CREATE_ENCODED_LINKS)
    # es_util.create_links_map(LINKS_INDEX, LINKS_TYPE)
    # mapper = None
    link_map = Mapper.fromFile(MAPPING_FILE_NAME).mappings
    # es_util.encode_crawled_links(LINKS_INDEX, LINKS_TYPE, link_map, ENCODED_LINKS_INDEX, ENCODED_LINKS_TYPE)
    es_util.encoded_links_to_web_graph(LINKS_INDEX, LINKS_TYPE,
                                       WEB_GRAPH_INDEX, WEB_GRAPH_TYPE,
                                       link_map)
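Every example here reads Mapper.fromFile(...).mappings as a url-to-id dictionary, and reverse=True flips it to id-to-url. A sketch consistent with that usage; the one-pair-per-line file format is an assumption:

class Mapper(object):
    def __init__(self, mappings):
        self.mappings = mappings

    @classmethod
    def fromFile(cls, file_name, reverse=False):
        # Assumed file format: one "url<TAB>encoded_id" pair per line.
        mappings = {}
        with open(file_name) as f:
            for line in f:
                url, encoded_id = line.rstrip('\n').split('\t')
                if reverse:
                    mappings[encoded_id] = url  # id -> url
                else:
                    mappings[url] = encoded_id  # url -> id
        return cls(mappings)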
Example #4
    def get_all_ids(self, index_name, index_type):
        """
        Returns all ids of given index

        :param index_name: Name of the index
        :param index_type: Type of the index
        :return: List of ids of entire index
        """
        # query scroll
        id_list = []
        link_map = Mapper.fromFile(MAPPING_FILE_NAME).mappings

        scroll = self.es.search(index=index_name,
                                doc_type=index_type,
                                scroll='10m',
                                size=10000,
                                fields=['_id'],
                                body={"query": {
                                    "match_all": {}
                                }})
        scroll_size = scroll['hits']['total']
        size = 0
        # retrieve results
        while scroll_size > 0:
            # scrolled data is in scroll['hits']['hits']
            hits_list = scroll['hits']['hits']
            for hit in hits_list:
                url = hit['_id']
                encoded_id = link_map[iri_to_uri(url)]
                id_list.append(encoded_id)
            # update scroll size
            scroll_size = len(scroll['hits']['hits'])
            size += scroll_size
            print "scrolled %s \n" % size
            # prepare next scroll
            scroll_id = scroll['_scroll_id']
            # perform next scroll
            scroll = self.es.scroll(scroll_id=scroll_id, scroll='10m')
        return id_list
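The manual scroll loop above can also be written with the scan helper from the official elasticsearch Python client, which drives the scroll API internally. A sketch assuming the same ES 1.x-era API as the original (fields and doc_type belong to that generation); normalize is injected to stand in for the iri_to_uri helper the original uses but never imports:

from elasticsearch.helpers import scan

def get_all_ids_with_scan(es, index_name, index_type, link_map, normalize):
    # scan() handles the scroll_id bookkeeping and yields hits one at a time.
    hits = scan(es,
                index=index_name,
                doc_type=index_type,
                query={"query": {"match_all": {}}},
                fields=['_id'],
                scroll='10m')
    return [link_map[normalize(hit['_id'])] for hit in hits]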
Example #5
    def __bootstrap(self):
        self.mapper = Mapper.fromFile(self.index)
        self.catalog = Catalog.fromFile(self.index)
        self.reader = InvertedIndexReader(
            self.catalog, '/Users/admin/Documents/CS6200/HW2/Index/Indices/')
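The index directory in Example #5 is hard-coded to a single machine. A sketch of the same bootstrap with the path injected instead (index_dir is a parameter name introduced here):

    def __bootstrap(self, index_dir):
        self.mapper = Mapper.fromFile(self.index)
        self.catalog = Catalog.fromFile(self.index)
        self.reader = InvertedIndexReader(self.catalog, index_dir)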
Example #6
    # The excerpt begins mid-docstring; the signature below is reconstructed
    # and the name __transform_hits is an assumption.
    def __transform_hits(self, hits):
        '''
        Transforms raw search hits into a dict of the form:
            {
                'id': (outlinks, inlinks)
            }
        '''

        transformed_hits = {}
        for hit in hits:
            ID = hit['_id']
            outlinks = hit['_source']['outlinks']
            inlinks = hit['_source']['inlinks']

            transformed_hits[ID] = (outlinks, inlinks)

        return transformed_hits

    def __merge_dicts(self, dict1, dict2):
        merged = dict1.copy()
        merged.update(dict2)
        return merged
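__merge_dicts is the standard Python 2 copy-and-update idiom (dict2 wins on duplicate keys). Under Python 3 the same merge is built in:

dict1 = {'a': 1, 'b': 2}
dict2 = {'b': 3, 'c': 4}

merged = {**dict1, **dict2}  # Python 3.5+: {'a': 1, 'b': 3, 'c': 4}
merged = dict1 | dict2       # Python 3.9+: same result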


if __name__ == '__main__':
    client = ESClient()
    mapper = Mapper.fromFile('link_map')
    pages = client.getCrawledPages(mapper)
    print "Writing to file"
    with open('/Users/admin/Documents/CS6200/HW4/pages', 'w') as f:
        for page in pages:
            f.write(page + '\n')
    print "DONE!"