# Instanciate the traph traph = Traph(overwrite=True, folder='./scripts/data/', default_webentity_creation_rule=default_webentity_creation_rule, webentity_creation_rules=webentity_creation_rules) print '\n:: Simulate a crawl:' print ' - Create webentity for "s:http|h:com|h:professor|p:augustine|p:sycamore|"' professor_prefixes = [ 's:http|h:com|h:professor|p:augustine|p:sycamore|', 's:http|h:com|h:professor|h:www|p:augustine|p:sycamore|', 's:https|h:com|h:professor|p:augustine|p:sycamore|', 's:https|h:com|h:professor|h:www|p:augustine|p:sycamore|' ] report = traph.create_webentity(professor_prefixes) webentity_store.data['webentities'].update(report.created_webentities) print ' - Simulate page crawls with links to the list of target pages' use_index_batch_crawl = True if use_index_batch_crawl: data = {} for i in range(len(SOURCE_PAGES)): lru = SOURCE_PAGES[i] # build links links = [] for j in range(len(TARGET_PAGES)): if j % 4 == i:
# Reset the stored webentities to an empty mapping before (re)building the traph.
webentity_store.data['webentities'] = {}

# Instantiate the traph (overwrite=True is passed, with data under ./scripts/data/).
traph = Traph(
    overwrite=True,
    folder='./scripts/data/',
    default_webentity_creation_rule=default_webentity_creation_rule,
    webentity_creation_rules=webentity_creation_rules
)

# NOTE: print is called with a single parenthesized argument throughout so the
# script emits the same text under Python 2 and also parses under Python 3.
print('\n:: Setup')

print('- Create a "Twitter" webentity with the 4 prefix variations (WWW and HTTPS cases)')
twitter_prefixes = [
    's:http|h:com|h:twitter|',
    's:http|h:com|h:twitter|h:www|',
    's:https|h:com|h:twitter|',
    's:https|h:com|h:twitter|h:www|'
]
report = traph.create_webentity(twitter_prefixes)
# Mirror the newly created webentities into the in-memory store.
webentity_store.data['webentities'].update(report.created_webentities)
# list(...) keeps the indexing valid when keys() returns a view instead of a list.
twitter_weid = list(report.created_webentities.keys())[0]  # Used below

print('- Create a "Ego" webentity with ego.com (4 prefixes) as well as a Twitter account (additional 4 prefixes)')
# NOTE(review): the four Twitter-path prefixes below end in 'p:ego' without the
# trailing '|' that every other prefix in this script has - confirm intended.
ego_prefixes = [
    's:http|h:com|h:ego|',
    's:http|h:com|h:ego|h:www|',
    's:https|h:com|h:ego|',
    's:https|h:com|h:ego|h:www|',
    's:http|h:com|h:twitter|p:ego',
    's:http|h:com|h:twitter|h:www|p:ego',
    's:https|h:com|h:twitter|p:ego',
    's:https|h:com|h:twitter|h:www|p:ego'
]
report = traph.create_webentity(ego_prefixes)
webentity_store.data['webentities'].update(report.created_webentities)
ego_weid = list(report.created_webentities.keys())[0]  # Used below

print('- Create a "Cheese" webentity with cheese.ego.com, Tweets about cheese and cheese.fr (12 prefixes)')
# Instantiate the traph (overwrite=True is passed, with data under ./scripts/data/).
traph = Traph(
    overwrite=True,
    folder='./scripts/data/',
    default_webentity_creation_rule=default_webentity_creation_rule,
    webentity_creation_rules=webentity_creation_rules
)

# NOTE: print is called with a single parenthesized argument throughout so the
# script emits the same text under Python 2 and also parses under Python 3.

# Step 1
print('\n:: Step 1 - Create a "Boeing" webentity with the 4 prefix variations (WWW and HTTPS cases).')
print('Expected: Creates the entity with the 4 prefixes. This is the typical use case.')

boeing_prefixes = [
    's:http|h:com|h:boeing|',
    's:http|h:com|h:boeing|h:www|',
    's:https|h:com|h:boeing|',
    's:https|h:com|h:boeing|h:www|'
]
report = traph.create_webentity(boeing_prefixes)
# Mirror the newly created webentities into the in-memory store.
webentity_store.data['webentities'].update(report.created_webentities)
# list(...) keeps the indexing valid when keys() returns a view instead of a list.
boeing_weid = list(report.created_webentities.keys())[0]  # Used for a step below

# Dump what the store holds: one header line per webentity, then its prefixes.
print('\nResult - Existing webentities from Store:')
for weid, prefixes in webentity_store.data['webentities'].items():
    print(' - Webentity %s:' % (weid))
    for prefix in prefixes:
        print('\t\t' + prefix)

# Dump what the traph itself reports, to compare against the store listing above.
print('\nResult - Prefixes from Traph:')
for node, lru in traph.webentity_prefix_iter():
    print(' - (%s) \t%s' % (node.webentity(), lru))

# Step 2
print('\n:: Step 2 - Create a "Airbus HTTPS" webentity with only 2 prefix variations (WWW case).')