def get_entities(trail_id):
    tangelo.log('Getting entities for trail: %s' % trail_id)
    entities = {}
    entityList = []
    urls = []
    rows = db.getBrowsePathUrls(trail_id)
    for row in rows:
        urls.append(row['url'])
    entity_data_connector = factory.get_entity_data_connector()
    results = entity_data_connector.get_extracted_entities_from_urls(urls)
    tangelo.log('Got entities')
    for result in results:
        for entityType in results[result]:
            for entityName in results[result][entityType]:
                if entityName in entities:
                    entities[entityName]['pages'] = entities[entityName]['pages'] + 1
                else:
                    entities[entityName] = {'type': entityType, 'pages': 1}
    # TODO either figure out how to map the data or do this differently
    for entity in entities:
        entityList.append({'name': entity,
                           'type': entities[entity]['type'],
                           'pages': entities[entity]['pages']})
    return json.dumps(entityList)

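# A minimal, self-contained sketch of the aggregation get_entities performs:
# count the number of pages each extracted entity appears on. The input data
# below is made up for illustration.
results = {
    'http://a.example': {'phone': ['555-1212'], 'email': ['x@y.com']},
    'http://b.example': {'phone': ['555-1212']},
}
entities = {}
for url in results:
    for entity_type in results[url]:
        for name in results[url][entity_type]:
            if name in entities:
                entities[name]['pages'] += 1
            else:
                entities[name] = {'type': entity_type, 'pages': 1}
print entities  # '555-1212' ends up with pages == 2, 'x@y.com' with pages == 1
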
def get_preview(domain):
    domain_content_connector = factory.get_entity_data_connector()
    try:
        data = domain_content_connector.get_domain_items(domain, 10)
        return json.dumps(data)
    finally:
        domain_content_connector.close()

def scrape_page(team_id, domain_id, trail_id, url, content, userEmail):
    content = urllib.unquote(content).encode('utf-8')
    url = url.encode('utf-8')
    connector = factory.get_entity_data_connector()
    (features, errors) = extractors.extractAll(content)
    for error in errors:
        tangelo.log("FEATURE EXTRACTION ERROR: " + error)
    for type, values in features.iteritems():
        connector.insert_entities(url, type, values)
        if len(values) > 0:
            features_in_domain = connector.get_domain_entity_matches(domain_id, type, values)
            if len(features_in_domain) > 0:
                tangelo.log("INSERTING DOMAIN ENTITIES")
                tangelo.log(type)
                tangelo.log(features_in_domain)
                connector.insert_domain_entities(str(domain_id), url, type, features_in_domain)
    id = db.addBrowsePathData(team_id, domain_id, trail_id, url, userEmail)
    count = db.getUrlCount(team_id, domain_id, trail_id, url)
    result = dict(id=id, count=count)
    return json.dumps(result)

def scrape_page(team_id, domain_id, trail_id, url, content, user_email):
    content = urllib.unquote(content).encode('utf-8')
    url = url.encode('utf-8')
    connector = factory.get_entity_data_connector()
    # blacklist of pages to not extract data from
    blacklist = config.get_extraction_blacklist()
    if urlparse(url).netloc not in blacklist:
        (features, errors) = extractors.extractAll(content)
        for error in errors:
            tangelo.log("FEATURE EXTRACTION ERROR: " + error)
        for type, values in features.iteritems():
            connector.insert_entities(url, type, values)
            if len(values) > 0:
                features_in_domain = connector.get_domain_entity_matches(domain_id, type, values)
                if len(features_in_domain) > 0:
                    tangelo.log("INSERTING DOMAIN ENTITIES")
                    tangelo.log(type)
                    connector.insert_domain_entities(str(domain_id), url, type, features_in_domain)
        # we also don't want to export blacklisted pages
        tangelo.log("Calling export")
        export_to_services(domain_id, team_id, trail_id, url, content, user_email, features)
    else:
        tangelo.log("Url: %s IN blacklist" % url)
    id = db.addBrowsePathData(team_id, domain_id, trail_id, url, user_email)
    count = db.getUrlCount(team_id, domain_id, trail_id, url)
    result = dict(id=id, count=count)
    return json.dumps(result)

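# Self-contained sketch of the netloc-based blacklist check used above; the
# blacklist entries are hypothetical. urlparse(url).netloc yields the host
# (and port, if any), so the blacklist works per-site rather than per-page.
from urlparse import urlparse  # urllib.parse on Python 3

blacklist = set(['ads.example.com', 'tracker.example.net'])

def should_extract(url):
    return urlparse(url).netloc not in blacklist

assert should_extract('http://news.example.org/story/1')
assert not should_extract('http://ads.example.com/banner?id=7')
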
def scrape_page(html, url, userId, userName, trail, domain, org):
    #tangelo.log('USER NAME: ' + userName)
    domain = domain.encode('utf-8')
    org = org.encode('utf-8')
    html = urllib.unquote(html).encode('utf-8')
    url = url.encode('utf-8')
    connector = factory.get_entity_data_connector()
    (features, errors) = extractors.extractAll(html)
    tangelo.log(features)
    for type, values in features.iteritems():
        connector.insert_entities(url, type, values)
        #for value in values:
        #    tangelo.log("EXTRACTED: " + type + "\t" + value)
        if len(values) > 0:
            features_in_domain = connector.get_domain_entity_matches(domain, type, values)
            if len(features_in_domain) > 0:
                connector.insert_domain_entities(domain, url, type, features_in_domain)
                #tangelo.log("EXTRACTED " + str(len(features_in_domain)) + " DOMAIN FEATURES")
    for error in errors:
        tangelo.log("FEATURE EXTRACTION ERROR: " + error)
    id = db.addBrowsePathData(org, url, userId, userName, trail, domain=domain)
    # get number of times this url appears in the database
    count = db.getUrlCount(org, url, domain=domain)
    result = dict(id=id, count=count)
    #tangelo.log("POSTED url:" + url + " return: " + str(result))
    return json.dumps(result)

def upload_file(*args, **kwargs):
    domain_content_connector = factory.get_entity_data_connector()
    try:
        domain_file = kwargs.get("file_upload")
        domain_name = kwargs.get("name")
        domain_description = kwargs.get("description")
        if not db.domain_exists(domain_name):
            if domain_file is not None:
                tangelo.log("read domain file")
                domain_file_lines = domain_file.file.readlines()
                domain_file_lines = map(lambda x: x.strip().replace('\0', ''), domain_file_lines)
                db.add_new_domain(domain_name, domain_description)
                rowkeys = []
                for line in domain_file_lines:
                    i = line.index(',')  # split on the first comma
                    type = line[:i]
                    value = line[i + 1:]
                    if type[0] == '"' and type[-1] == '"':
                        type = type[1:-1]
                    if value[0] == '"' and value[-1] == '"':
                        value = value[1:-1]
                    rowkeys.append(domain_name + '\0' + type + '\0' + value)
                result = domain_content_connector.add_new_domain_items(rowkeys)
                return json.dumps(dict(success=result))
            else:
                return json.dumps(dict(success=False))
        else:
            return json.dumps(dict(success=False))
    finally:
        domain_content_connector.close()

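# Standalone sketch of the "type,value" line format consumed above: split on
# the first comma only (so values may contain commas), strip optional double
# quotes, and join the fields into a NUL-separated rowkey. Names here are
# illustrative.
def make_rowkey(domain_name, line):
    i = line.index(',')                    # first comma only
    type_, value = line[:i], line[i + 1:]
    if type_[:1] == '"' and type_[-1:] == '"':
        type_ = type_[1:-1]
    if value[:1] == '"' and value[-1:] == '"':
        value = value[1:-1]
    return domain_name + '\0' + type_ + '\0' + value

assert make_rowkey('mydomain', 'phone,555-1212').split('\0') == \
    ['mydomain', 'phone', '555-1212']
assert make_rowkey('mydomain', 'address,"12 Main St, Springfield"').split('\0') == \
    ['mydomain', 'address', '12 Main St, Springfield']
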
def delete_domain(domain_name):
    if db.domain_exists(domain_name):
        domain_content_connector = factory.get_entity_data_connector()
        db.remove_domain(domain_name)
        domain_content_connector.delete_domain_items(domain_name)
        return json.dumps(dict(success=True))
    return json.dumps(dict(success=False))

def upload_database_threaded(domain_name, connection_string, domain_description,
                             table_name, attribute_column, value_column):
    domain_content_connector = factory.get_entity_data_connector()
    connector = ConnectorUtil.get_database_connector(connection_string, table_name,
                                                     attribute_column, value_column)
    rows = connector.get_domain_items()
    success = domain_content_connector.add_new_domain_items(
        map(lambda items: "%s\0%s\0%s" % (domain_name, items[0], items[1]), rows))
    complete_dict = dict(domain=domain_name, description=domain_description,
                         success=success, complete=True)
    completed_threads.put(complete_dict)

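# Minimal sketch of the worker-thread-plus-completion-queue pattern implied by
# completed_threads.put(...) above; the queue and worker here are stand-ins,
# not the project's actual wiring.
import threading
import Queue  # 'queue' on Python 3

completed_threads = Queue.Queue()

def worker(domain_name):
    # long-running upload work would happen here
    completed_threads.put(dict(domain=domain_name, success=True, complete=True))

t = threading.Thread(target=worker, args=('mydomain',))
t.start()
t.join()
print completed_threads.get()  # {'domain': 'mydomain', 'success': True, 'complete': True}
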
def delete_domain(team_id, domain_id):
    tangelo.log("deleting domain: " + str(domain_id))
    domain_content_connector = factory.get_entity_data_connector()
    domain_content_connector.delete_domain_items(domain_id)
    tangelo.log("deleted features")
    db.remove_domain(domain_id)
    tangelo.log("deleted meta data")
    return json.dumps(dict(success=True))

def get_preview(*args, **kwargs):
    domain_content_connector = factory.get_entity_data_connector()
    try:
        name = kwargs.get("domain")
        data = domain_content_connector.get_domain_items(name, 10)
        return json.dumps(data)
    finally:
        domain_content_connector.close()

def get_all_entities(url):
    entity_data_connector = None
    try:
        entity_data_connector = factory.get_entity_data_connector()
        all_entities = entity_data_connector.get_extracted_entities_from_urls([url])
        entities = all_entities.get(url, {})
        return json.dumps(entities)
    finally:
        # guard against the factory call itself failing, which would leave
        # entity_data_connector as None
        if entity_data_connector is not None:
            entity_data_connector.close()

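# The None-guarded try/finally above can also be expressed with
# contextlib.closing, assuming the connector exposes close(); the stub class
# below stands in for whatever factory.get_entity_data_connector() returns.
from contextlib import closing

class StubConnector(object):
    def get_extracted_entities_from_urls(self, urls):
        return dict((u, {}) for u in urls)

    def close(self):
        pass

with closing(StubConnector()) as connector:
    print connector.get_extracted_entities_from_urls(['http://example.com'])
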
def get_domain_extracted_entities(url, domain):
    entity_data_connector = None
    try:
        entity_data_connector = factory.get_entity_data_connector()
        domain_entities = entity_data_connector.get_extracted_domain_entities_from_urls(domain, [url])
        return json.dumps(dict(domainExtracted=domain_entities.get(url)))
    finally:
        if entity_data_connector is not None:
            entity_data_connector.close()

def get_domain_extracted_entities(url, domain_id, team_id):
    entity_data_connector = None
    try:
        entity_data_connector = factory.get_entity_data_connector()
        domain_entities = entity_data_connector.get_extracted_domain_entities_from_urls(domain_id, [url])
        return json.dumps(domain_entities.get(url))
    finally:
        if entity_data_connector is not None:
            entity_data_connector.close()

def get_all_entities(url, domain):
    entity_data_connector = None
    try:
        entity_data_connector = factory.get_entity_data_connector()
        all_entities = entity_data_connector.get_extracted_entities_from_urls([url])
        domain_extracted = entity_data_connector.get_extracted_domain_entities_from_urls(domain, [url])
        entities = dict(domainExtracted=domain_extracted.get(url, {}),
                        allEntities=all_entities.get(url, {}))
        return json.dumps(entities)
    finally:
        # guard against the factory call itself failing
        if entity_data_connector is not None:
            entity_data_connector.close()

def get(team_id, domain_id, limit):
    """
    Verify the logged-in user has access to the requested team and return up
    to `limit` items from the domain.
    :param team_id: id of the team to authorize against
    :param domain_id: id of the domain to read from
    :param limit: maximum number of items to return
    :return: list of domain features [{type: .., value: ...}, ..]
    """
    entity_data_connector = factory.get_entity_data_connector()
    results = entity_data_connector.get_domain_items(domain_id, limit)
    return json.dumps(results)

def delete_domain(*args, **kwargs):
    domain_name = kwargs.get("domain_name")
    for key in kwargs.keys():
        tangelo.log(key)
    if db.domain_exists(domain_name):
        domain_content_connector = factory.get_entity_data_connector()
        db.remove_domain(domain_name)
        domain_content_connector.delete_domain_items(domain_name)
        return json.dumps(dict(success=True))
    else:
        return json.dumps(dict(success=False))

def get(domain, trail, stars, newdomain):
    org = helper.get_org().upper()
    if not db.domain_exists(newdomain):
        db.add_new_domain(newdomain, 'auto generated domain from trail: ' + trail)
    features = set([])
    url_set = set([])
    stars = int(stars)

    # get all starred urls for the trail
    for (url, rank) in db.getRankedUrls(org, trail, domain):
        url_set.add(url)
    if stars < 1:
        urls = db.getBrowsePathUrls(org, trail)
        for url in urls:
            url_set.add(url)

    # get the list of invalid entities for the domain
    markedEntities = set([])
    for (type, value) in db.get_marked_entities_for_domain(org, domain):
        markedEntities.add(value)

    # for each url get all extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(url_set)
    for url, featureDict in all_entities.iteritems():
        for type, values in featureDict.iteritems():
            type = type.replace(',', ' ')
            for value in values:
                if value not in markedEntities:
                    value = value.replace(',', ' ')
                    features.add(type + "\0" + value)

    # for each url get any manually extracted entities
    for url in url_set:
        for featureObj in db.get_feedback_entities(org, domain, url):
            type = featureObj['type'].replace(',', ' ')
            value = featureObj['value'].replace(',', ' ')
            features.add(type + "\0" + value)

    entity_data_connector.add_new_domain_items(
        map(lambda x: newdomain + '\0' + x, features))

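# Quick illustration of the NUL-separated key convention used above: domain
# items are strings of the form domain\0type\0value, which is why commas are
# scrubbed from types and values first. Data here is hypothetical.
features = set([('phone', '555-1212'), ('email', 'a@b.com')])
rows = ['newdomain' + '\0' + t + '\0' + v for (t, v) in features]
for r in sorted(rows):
    print r.split('\0')  # e.g. ['newdomain', 'phone', '555-1212']
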
def get_lookahead(url, srcurl, domain):
    entity_data_connector = None
    try:
        entity_data_connector = factory.get_entity_data_connector()
        # get the features from the lookahead url that are also on the src url
        matches = entity_data_connector.get_matching_entities_from_url([url, srcurl])
        domain_matches = entity_data_connector.get_extracted_domain_entities_for_urls(domain, [url])
        result = dict(url=url, matches=matches, domain_search_matches=domain_matches)
        return json.dumps(result)
    finally:
        if entity_data_connector is not None:
            entity_data_connector.close()

def get(domain, trail):
    org = helper.get_org()
    trail_report = {}

    # get all starred urls for the trail
    for (url, rank) in db.getRankedUrls(org, trail, domain):
        trail_report[url] = {'rank': rank}

    # get the list of invalid entities for the domain
    markedEntities = set([])
    for (type, value) in db.get_marked_entities_for_domain(org, domain):
        markedEntities.add(value)

    # for each url get all extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(trail_report.keys())
    for url, featureDict in all_entities.iteritems():
        for type, values in featureDict.iteritems():
            filtered_values = []
            for value in values:
                if value not in markedEntities:
                    filtered_values.append(value)
            if len(filtered_values) > 0:
                try:
                    if 'auto_features' not in trail_report[url]:
                        trail_report[url]['auto_features'] = {}
                    trail_report[url]['auto_features'][type] = filtered_values
                except:
                    tangelo.log("report generation error. skipping url.")
                    continue

    # for each url get any manually extracted entities
    for url in trail_report.keys():
        for featureObj in db.get_feedback_entities(org, domain, url):
            if 'manual_features' not in trail_report[url]:
                trail_report[url]['manual_features'] = {}
            if featureObj['type'] not in trail_report[url]['manual_features']:
                trail_report[url]['manual_features'][featureObj['type']] = []
            trail_report[url]['manual_features'][featureObj['type']].append(featureObj['value'])

    # for each url get any highlighted text
    for url in trail_report.keys():
        selections = db.getSelections(domain, trail, url, org)
        if len(selections) > 0:
            trail_report[url]['selections'] = selections

    result = {'trail': trail, 'urls': trail_report}
    return json.dumps(result, sort_keys=True, indent=4, separators=(',', ':'))

def loadDomain(name, description, filename):
    print "\nloading domain: ", name, "\ndescription: ", description, "\nfrom path: ", filename, "\n"
    # if db.domain_exists(name):
    #     raise ValueError("Domain already exists: " + name)
    fobj = open(filename, "r")
    domain_content_connector = factory.get_entity_data_connector()
    try:
        cnt = 0
        items = []
        for row in fobj:
            row = row.strip()
            row = row.replace("\0", "")
            cnt = cnt + 1
            i = row.index(",")  # split on the first comma
            attr = row[:i]
            value = row[i + 1:]
            if attr[0] == '"' and attr[-1] == '"':
                attr = attr[1:-1]
            if value[0] == '"' and value[-1] == '"':
                value = value[1:-1]
            items.append(name + "\0" + attr + "\0" + value)
            # flush to the data store in batches of 1000
            if cnt % 1000 == 0:
                domain_content_connector.add_new_domain_items(items)
                items = []
                print "added: ", cnt
        if len(items) > 0:
            domain_content_connector.add_new_domain_items(items)
        print "added new items cnt=", cnt
        db.add_new_domain(name, description)
    finally:
        fobj.close()
        domain_content_connector.close()

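# Standalone sketch of the batched-insert pattern loadDomain uses, with the
# data store replaced by a stub sink; the batch size of 1000 matches the code
# above, and all names here are illustrative.
def batch_insert(rows, sink, batch_size=1000):
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) == batch_size:
            sink(batch)   # flush a full batch
            batch = []
    if batch:
        sink(batch)       # flush the remainder

batches = []
batch_insert(range(2500), batches.append, batch_size=1000)
print [len(b) for b in batches]  # [1000, 1000, 500]
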
def upload_file(team_id, name, description, features):
    # tangelo.log("Loading new domain: ")
    # tangelo.log(name)
    # tangelo.log(description)
    # tangelo.log(features)
    domain_id = db.add_new_domain(team_id, name, description)
    domain_content_connector = factory.get_entity_data_connector()
    try:
        domain_content_connector.add_new_domain_items(domain_id, features)
    except:
        db.remove_domain(domain_id)
        raise
    finally:
        domain_content_connector.close()
    newdomain = dict(id=domain_id, name=name, description=description)
    # tangelo.log("loaded new domain")
    # tangelo.log(newdomain)
    return json.dumps(newdomain)

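# Sketch of the compensate-and-reraise pattern used above: if the bulk item
# insert fails, remove the just-created domain metadata so no half-created
# domain remains, then re-raise. All names below are stand-ins.
def create_with_rollback(add_meta, add_items, remove_meta):
    meta_id = add_meta()
    try:
        add_items(meta_id)
    except:
        remove_meta(meta_id)
        raise
    return meta_id

made = []

def add_meta():
    made.append('meta')
    return 42

def failing_add_items(meta_id):
    raise RuntimeError('insert failed')

try:
    create_with_rollback(add_meta, failing_add_items, lambda mid: made.remove('meta'))
except RuntimeError:
    pass
assert made == []  # the metadata was rolled back
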
def upload_file(file_upload, name, description):
    tangelo.log("Loading new domain: " + name)
    domain_content_connector = factory.get_entity_data_connector()
    try:
        if not db.domain_exists(name):
            if file_upload is not None:
                domain_file_lines = file_upload.file.readlines()
                domain_file_lines = map(lambda x: x.strip().replace('\0', ''), domain_file_lines)
                db.add_new_domain(name, description)
                rowkeys = []
                for line in domain_file_lines:
                    i = line.index(',')  # split on the first comma
                    type = line[:i]
                    value = line[i + 1:]
                    if type[0] == '"' and type[-1] == '"':
                        type = type[1:-1]
                    if value[0] == '"' and value[-1] == '"':
                        value = value[1:-1]
                    rowkeys.append("%s\0%s\0%s" % (name, type, value))
                result = domain_content_connector.add_new_domain_items(rowkeys)
                return json.dumps(dict(success=result))
            return json.dumps(dict(success=False))
        return json.dumps(dict(success=False))
    finally:
        domain_content_connector.close()

""" import igraph import tangelo import time import datawake.util.dataconnector.factory as factory import tldextract from datawake.util.db import datawake_mysql from urlparse import urlparse """ Provides all the functionality around building graphs for display on the datawake forensic view. """ entityDataConnector = factory.get_entity_data_connector() def getBrowsePathEdges(trail_id, startdate, enddate, userlist=[]): tangelo.log('getBrowsePathEdges(%s,%s,%s)' % (startdate, enddate, userlist)) rows = datawake_mysql.getVisitedUrlsInTrailForTimeRange( trail_id, startdate, enddate, userlist) edges = [] nodes = {} edge_buffer = [] for row in rows: tangelo.log(row) ts = row['ts']
def get(team_id, domain_id, trail_id, stars, extracted_features):
    """
    :param team_id: id of the team
    :param domain_id: id of the domain
    :param trail_id: id of the trail
    :param stars: minimum avg star ranking to allow in the report
    :param extracted_features: y or n (include auto extracted features in the report)
    :return: {
        trail: {id: .., name: .., description: .., url-count: ..},
        urls: [
            {
                url: ..,
                url-visit-count: ..,
                rank: {min: ?, max: ?, avg: ?, count: ?},  # rounded to nearest tenth
                auto_features: {type: [value, ..], ..},
                manual_features: {type: [value, ..], ..},
                selections: [{ts: "2015-02-25 16:24:41", selection: "selected text"}, ..]
            }, ..
        ]
    }
    """
    trailData = db.getTrailData(trail_id)
    trail_report = {}

    # get ranked urls
    for rankObject in db.getRankedUrls(trail_id):
        if rankObject['avg'] >= stars:
            url = rankObject['url']
            del rankObject['url']
            trail_report[url] = {'url': url, 'rank': rankObject}

    # get url hit counts and unranked urls
    for urlObj in db.getBrowsePathUrls(trail_id):
        url = urlObj['url']
        if url in trail_report or stars < 1:
            if url not in trail_report:
                trail_report[url] = {'url': url,
                                     'rank': {'min': 0, 'max': 0, 'avg': 0, 'count': 0}}
            trail_report[url]['url-visit-count'] = urlObj['count']
    trailData['url-count'] = len(trail_report.keys())

    if extracted_features != 'n':
        # get the list of invalid entities for the domain
        markedEntities = set([])
        for featureObj in db.get_marked_features(trail_id):
            key = featureObj['type'] + ':' + featureObj['value']
            markedEntities.add(key)

        # for each url get all extracted entities
        entity_data_connector = factory.get_entity_data_connector()
        all_entities = entity_data_connector.get_extracted_entities_from_urls(trail_report.keys())
        for url, featureDict in all_entities.iteritems():
            for type, values in featureDict.iteritems():
                filtered_values = []
                for value in values:
                    key = type + ':' + value
                    if key not in markedEntities:
                        filtered_values.append(value)
                if len(filtered_values) > 0:
                    try:
                        if 'auto_features' not in trail_report[url]:
                            trail_report[url]['auto_features'] = {}
                        trail_report[url]['auto_features'][type] = filtered_values
                    except:
                        tangelo.log("report generation error. skipping url.")
                        continue

    # for each url get any manually extracted entities
    for url in trail_report.keys():
        for featureObj in db.get_manual_features(trail_id, url):
            if 'manual_features' not in trail_report[url]:
                trail_report[url]['manual_features'] = {}
            if featureObj['type'] not in trail_report[url]['manual_features']:
                trail_report[url]['manual_features'][featureObj['type']] = []
            trail_report[url]['manual_features'][featureObj['type']].append(featureObj['value'])

    # for each url get any highlighted text; keep user names out of reports for now
    for url in trail_report.keys():
        selections = db.getSelections(trail_id, url)
        if len(selections) > 0:
            for sel in selections:
                del sel['userEmail']
            trail_report[url]['selections'] = selections

    result = {'trail': trailData, 'urls': trail_report.values()}
    return json.dumps(result, sort_keys=True, indent=4, separators=(',', ':'))

""" import igraph import tangelo import datawake.util.dataconnector.factory as factory from datawake.util.db import datawake_mysql """ Provides all the functionality around building graphs for display on the datawake forensic view. """ entityDataConnector = factory.get_entity_data_connector() def getBrowsePathEdges(org,startdate,enddate,userlist=[],trail='*',domain=''): print 'getBrowsePathEdges(',startdate,',',enddate,',',userlist,')' org = org.upper() command = """SELECT unix_timestamp(t1.ts) as ts, t1.url,hits,userName,userId,id,trail FROM datawake_data as t1 LEFT JOIN (select url,count(url) as hits from datawake_data WHERE org = %s and domain = %s group by url ) as t2 ON t1.url = t2.url WHERE t1.org = %s and t1.domain = %s """ commandArgs = [org,domain,org,domain] # add the time filter to the query if (startdate == '' and enddate == ''): pass elif (startdate != '' and enddate == ''):