Example #1
def get_entities(trail_id):
    tangelo.log('Getting entities for trail: %s' % trail_id)
    entities = {}
    entityList = []
    urls = []
    rows = db.getBrowsePathUrls(trail_id)
    for row in rows:
        urls.append(row['url'])

    entity_data_connector = factory.get_entity_data_connector()
    results = entity_data_connector.get_extracted_entities_from_urls(urls)

    tangelo.log('Got entities')

    for result in results:
        for entityType in results[result]:
            for entityName in results[result][entityType]:
                if entityName in entities:
                    entities[entityName]['pages'] += 1
                else:
                    entities[entityName] = {'type': entityType, 'pages': 1}
    # TODO: either figure out how to map the data or do this differently
    for entity in entities:
        entityList.append({'name': entity, 'type': entities[entity]['type'], 'pages': entities[entity]['pages']})
    return json.dumps(entityList)
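For context, here is a minimal sketch of how a caller might consume this endpoint's output; the trail id and the entity values in the comments are invented for illustration, not taken from the Datawake source.

# Hypothetical consumer of get_entities(); trail id 42 and the sample
# entities are made up.
import json

entity_list = json.loads(get_entities(42))
# entity_list looks like:
# [{'name': 'bob@example.com', 'type': 'email', 'pages': 3}, ...]
for entity in sorted(entity_list, key=lambda e: e['pages'], reverse=True)[:10]:
    print '%s (%s) seen on %d page(s)' % (entity['name'], entity['type'], entity['pages'])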
Example #2
def get_preview(domain):
    domain_content_connector = factory.get_entity_data_connector()
    try:
        data = domain_content_connector.get_domain_items(domain, 10)
        return json.dumps(data)
    finally:
        domain_content_connector.close()
Example #3
def scrape_page(team_id, domain_id, trail_id, url, content, userEmail):

    content = urllib.unquote(content).encode('utf-8')
    url = url.encode('utf-8')

    connector = factory.get_entity_data_connector()
    (features, errors) = extractors.extractAll(content)
    for error in errors:
        tangelo.log("FEATURE EXTRACTION ERROR: " + error)

    for type, values in features.iteritems():
        connector.insert_entities(url, type, values)
        if len(values) > 0:
            features_in_domain = connector.get_domain_entity_matches(
                domain_id, type, values)
            if len(features_in_domain) > 0:
                tangelo.log("INSERTING DOMAIN ENTITIES")
                tangelo.log(type)
                tangelo.log(features_in_domain)
                connector.insert_domain_entities(str(domain_id), url, type,
                                                 features_in_domain)

    id = db.addBrowsePathData(team_id, domain_id, trail_id, url, userEmail)
    count = db.getUrlCount(team_id, domain_id, trail_id, url)
    result = dict(id=id, count=count)
    return json.dumps(result)
Example #4
def scrape_page(team_id, domain_id, trail_id, url, content, userEmail):

    content = urllib.unquote(content).encode('utf-8')
    url = url.encode('utf-8')

    connector = factory.get_entity_data_connector()
    (features, errors) = extractors.extractAll(content)
    for error in errors:
        tangelo.log("FEATURE EXTRACTION ERROR: " + error)

    for type, values in features.iteritems():
        connector.insert_entities(url, type, values)
        if len(values) > 0:
            features_in_domain = connector.get_domain_entity_matches(domain_id, type, values)
            if len(features_in_domain) > 0:
                tangelo.log("INSERTING DOMAIN ENTITIES")
                tangelo.log(type)
                tangelo.log(features_in_domain)
                connector.insert_domain_entities(str(domain_id), url, type, features_in_domain)

    id = db.addBrowsePathData(team_id, domain_id, trail_id, url, userEmail)
    count = db.getUrlCount(team_id, domain_id, trail_id, url)
    result = dict(id=id, count=count)
    return json.dumps(result)
Example #5
def scrape_page(team_id, domain_id, trail_id, url, content, user_email):

    content = urllib.unquote(content).encode('utf-8')
    url = url.encode('utf-8')

    connector = factory.get_entity_data_connector()

    # blacklist of pages to not extract data from
    blacklist = config.get_extraction_blacklist()
    if urlparse(url).netloc not in blacklist:
        (features, errors) = extractors.extractAll(content)
        for error in errors:
            tangelo.log("FEATURE EXTRACTION ERROR: " + error)

        for type, values in features.iteritems():
            connector.insert_entities(url, type, values)
            if len(values) > 0:
                features_in_domain = connector.get_domain_entity_matches(domain_id, type, values)
                if len(features_in_domain) > 0:
                    tangelo.log("INSERTING DOMAIN ENTITIES")
                    tangelo.log(type)
                    connector.insert_domain_entities(str(domain_id), url, type, features_in_domain)
        # we also don't want to export blacklisted pages.
        tangelo.log("Calling export")
        export_to_services(domain_id, team_id, trail_id, url, content, user_email, features)
    else:
        tangelo.log("Url: %s IN blacklist" % url)

    id = db.addBrowsePathData(team_id, domain_id, trail_id, url, user_email)
    count = db.getUrlCount(team_id, domain_id, trail_id, url)
    result = dict(id=id, count=count)
    return json.dumps(result)
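To make the blacklist check above concrete, here is a small sketch of the netloc comparison; the blacklist entries and URL are assumptions, since the real values come from config.get_extraction_blacklist().

# Illustrative netloc matching; blacklist values and url are hypothetical.
from urlparse import urlparse

blacklist = ['ads.example.com', 'tracker.example.net']
url = 'http://ads.example.com/banner?id=1'
print urlparse(url).netloc               # 'ads.example.com'
print urlparse(url).netloc in blacklist  # True -> page would be skipped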
Example #6
def scrape_page(html, url, userId, userName, trail, domain, org):
    #tangelo.log('USER NAME: ' + userName)
    domain = domain.encode('utf-8')
    org = org.encode('utf-8')
    html = urllib.unquote(html).encode('utf-8')
    url = url.encode('utf-8')

    connector = factory.get_entity_data_connector()
    (features, errors) = extractors.extractAll(html)
    tangelo.log(features)
    for type, values in features.iteritems():
        connector.insert_entities(url, type, values)
        #for value in values:
        #    tangelo.log("EXTRACTED: "+type+"\t"+value)
        if len(values) > 0:
            features_in_domain = connector.get_domain_entity_matches(domain, type, values)
            if len(features_in_domain) > 0:
                connector.insert_domain_entities(domain, url, type, features_in_domain)
                #tangelo.log("EXTRACTED "+str(len(features_in_domain))+" DOMAIN FEATURES")

    for error in errors:
        tangelo.log("FEATURE EXTRACTION ERROR: " + error)

    id = db.addBrowsePathData(org, url, userId, userName, trail, domain=domain)
    # get number of times this url appears in the database
    count = db.getUrlCount(org, url, domain=domain)
    result = dict(id=id, count=count)

    #tangelo.log("POSTED url:" + url + "  return: " + str(result))
    return json.dumps(result)
Example #7
File: loader.py  Project: diffeo/Datawake
def upload_file(*args, **kwargs):
    domain_content_connector = factory.get_entity_data_connector()
    try:
        domain_file = kwargs.get("file_upload")
        domain_name = kwargs.get("name")
        domain_description = kwargs.get("description")
        if not db.domain_exists(domain_name):
            if domain_file is not None:
                tangelo.log("read domain file")
                domain_file_lines = domain_file.file.readlines()
                domain_file_lines = map(lambda x: x.strip().replace('\0', ''), domain_file_lines)
                db.add_new_domain(domain_name, domain_description)
                rowkeys = []
                for line in domain_file_lines:
                    i = line.index(',')  # split on the first comma
                    type = line[:i]
                    value = line[i + 1:]
                    if type[0] == '"' and type[-1] == '"':
                        type = type[1:-1]
                    if value[0] == '"' and value[-1] == '"':
                        value = value[1:-1]
                    rowkeys.append(domain_name + '\0' + type + '\0' + value)
                result = domain_content_connector.add_new_domain_items(rowkeys)
                return json.dumps(dict(success=result))
            else:
                return json.dumps(dict(success=False))
        else:
            return json.dumps(dict(success=False))
    finally:
        domain_content_connector.close()
Example #8
def delete_domain(domain_name):
    if db.domain_exists(domain_name):
        domain_content_connector = factory.get_entity_data_connector()
        db.remove_domain(domain_name)
        domain_content_connector.delete_domain_items(domain_name)
        return json.dumps(dict(success=True))
    return json.dumps(dict(success=False))
Example #9
def upload_database_threaded(domain_name, connection_string, domain_description, table_name, attribute_column, value_column):
    domain_content_connector = factory.get_entity_data_connector()
    connector = ConnectorUtil.get_database_connector(connection_string, table_name, attribute_column, value_column)
    rows = connector.get_domain_items()
    success = domain_content_connector.add_new_domain_items(map(lambda items: "%s\0%s\0%s" % (domain_name, items[0], items[1]), rows))
    complete_dict = dict(domain=domain_name, description=domain_description, success=success, complete=True)
    completed_threads.put(complete_dict)
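Since upload_database_threaded reports completion only through the completed_threads queue, a caller presumably polls it; here is a sketch under that assumption (the polling helper itself is hypothetical, not part of Datawake):

# Hypothetical poller that drains the module-level completed_threads queue.
import Queue

def poll_completed_uploads():
    finished = []
    try:
        while True:  # drain without blocking
            finished.append(completed_threads.get_nowait())
    except Queue.Empty:
        pass
    return finished  # e.g. [{'domain': ..., 'success': True, 'complete': True}]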
Example #10
def delete_domain(team_id, domain_id):
    tangelo.log("deleting domain: " + str(domain_id))
    domain_content_connector = factory.get_entity_data_connector()
    domain_content_connector.delete_domain_items(domain_id)
    tangelo.log("delted features")
    db.remove_domain(domain_id)
    tangelo.log("delted meta data")
    return json.dumps(dict(success=True))
Example #12
File: loader.py  Project: diffeo/Datawake
def get_preview(*args, **kwargs):
    domain_content_connector = factory.get_entity_data_connector()
    try:
        name = kwargs.get("domain")
        data = domain_content_connector.get_domain_items(name, 10)
        return json.dumps(data)
    finally:
        domain_content_connector.close()
Example #13
def get_all_entities(url):
    entity_data_connector = None
    try:
        entity_data_connector = factory.get_entity_data_connector()
        all_entities = entity_data_connector.get_extracted_entities_from_urls([url])
        entities = all_entities.get(url, {})
        return json.dumps(entities)
    finally:
        if entity_data_connector is not None:
            entity_data_connector.close()
Example #14
def get_domain_extracted_entities(url, domain):
    entity_data_connector = None
    try:
        entity_data_connector = factory.get_entity_data_connector()
        domain_entities = entity_data_connector.get_extracted_domain_entities_from_urls(domain, [url])
        return json.dumps(dict(domainExtracted=domain_entities.get(url)))
    finally:
        if entity_data_connector is not None:
            entity_data_connector.close()
Example #15
def get_all_entities(url):
    entity_data_connector = None
    try:
        entity_data_connector = factory.get_entity_data_connector()
        all_entities = entity_data_connector.get_extracted_entities_from_urls(
            [url])
        entities = all_entities.get(url, {})
        return json.dumps(entities)
    finally:
        if entity_data_connector is not None:
            entity_data_connector.close()
Example #16
def get_domain_extracted_entities(url, domain_id, team_id):
    entity_data_connector = None
    try:
        entity_data_connector = factory.get_entity_data_connector()
        domain_entities = entity_data_connector.get_extracted_domain_entities_from_urls(
            domain_id, [url])
        return json.dumps(domain_entities.get(url))
    finally:
        if entity_data_connector is not None:
            entity_data_connector.close()
Example #17
def get_all_entities(url, domain):
    entity_data_connector = None
    try:
        entity_data_connector = factory.get_entity_data_connector()
        all_entities = entity_data_connector.get_extracted_entities_from_urls([url])
        domain_extracted = entity_data_connector.get_extracted_domain_entities_from_urls(domain, [url])
        entities = dict(domainExtracted=domain_extracted.get(url, {}), allEntities=all_entities.get(url, {}))
        return json.dumps(entities)

    finally:
        if entity_data_connector is not None:
            entity_data_connector.close()
Example #18
def get(team_id, domain_id, limit):
    """
    Verify the logged-in user has access to the requested team and return up to `limit` items from the domain.
    :param team_id: id of the team
    :param domain_id: id of the domain
    :param limit: maximum number of items to return
    :return: List of domain features
            [{type:.., value:...},..]
    """
    entity_data_connector = factory.get_entity_data_connector()
    results = entity_data_connector.get_domain_items(domain_id, limit)
    return json.dumps(results)
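A quick sketch of consuming the shape the docstring describes; the ids are invented and the field access is an assumption based on that docstring.

# Illustrative round-trip; ids are made up.
import json

items = json.loads(get(team_id=1, domain_id=7, limit=25))
for item in items:
    print item['type'], item['value']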
Example #19
File: loader.py  Project: diffeo/Datawake
def delete_domain(*args, **kwargs):
    domain_name = kwargs.get("domain_name")
    for key in kwargs.keys():
        tangelo.log(key)
    if db.domain_exists(domain_name):
        domain_content_connector = factory.get_entity_data_connector()
        db.remove_domain(domain_name)
        domain_content_connector.delete_domain_items(domain_name)
        return json.dumps(dict(success=True))
    else:
        return json.dumps(dict(success=False))
Example #20
def get(domain, trail, stars, newdomain):
    org = helper.get_org().upper()

    if not db.domain_exists(newdomain):
        db.add_new_domain(newdomain, 'auto generated domain from trail: ' + trail)

    features = set()
    url_set = set()
    stars = int(stars)

    # get all starred urls for the trail
    for (url, rank) in db.getRankedUrls(org, trail, domain):
        url_set.add(url)

    if stars < 1:
        urls = db.getBrowsePathUrls(org, trail)
        for url in urls:
            url_set.add(url)

    # get the list of invalid entities for the domain
    markedEntities = set()
    for (type, value) in db.get_marked_entities_for_domain(org, domain):
        markedEntities.add(value)

    # for each url get all extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(url_set)
    for url, featureDict in all_entities.iteritems():
        for type, values in featureDict.iteritems():
            type = type.replace(',', ' ')
            for value in values:
                if value not in markedEntities:
                    value = value.replace(',', ' ')
                    features.add(type + "\0" + value)

    # for each url get any manually extracted entities
    for url in url_set:
        for featureObj in db.get_feedback_entities(org, domain, url):
            type = featureObj['type'].replace(',', ' ')
            value = featureObj['value'].replace(',', ' ')
            features.add(type + "\0" + value)

    entity_data_connector.add_new_domain_items(map(lambda x: newdomain + '\0' + x, features))
Example #21
def get_lookahead(url, srcurl, domain):
    entity_data_connector = None
    try:
        entity_data_connector = factory.get_entity_data_connector()
        # get the features from the lookahead url that are also on the src url
        matches = entity_data_connector.get_matching_entities_from_url([url, srcurl])
        domain_matches = entity_data_connector.get_extracted_domain_entities_for_urls(domain, [url])
        result = dict(url=url, matches=matches, domain_search_matches=domain_matches)
        return json.dumps(result)
    finally:
        if entity_data_connector is not None:
            entity_data_connector.close()
Example #22
def get(domain, trail):
    org = helper.get_org()

    trail_report = {}

    # get all starred urls for the trail
    for (url, rank) in db.getRankedUrls(org, trail, domain):
        trail_report[url] = {
            'rank': rank,
        }

    # get the list of invalid entities for the domain
    markedEntities = set([])
    for (type, value) in db.get_marked_entities_for_domain(org, domain):
        markedEntities.add(value)

    # for each url get all extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(
        trail_report.keys())
    for url, featureDict in all_entities.iteritems():
        for type, values in featureDict.iteritems():
            filtered_values = []
            for value in values:
                if value not in markedEntities:
                    filtered_values.append(value)
            if len(filtered_values) > 0:
                try:
                    if 'auto_features' not in trail_report[url]:
                        trail_report[url]['auto_features'] = {}
                    trail_report[url]['auto_features'][type] = filtered_values
                except Exception:
                    tangelo.log("report generation error. skipping url.")
                    continue

    # for each url get any manually extracted entities
    for url in trail_report.keys():
        for featureObj in db.get_feedback_entities(org, domain, url):
            if 'manual_features' not in trail_report[url]:
                trail_report[url]['manual_features'] = {}
            if featureObj['type'] not in trail_report[url]['manual_features']:
                trail_report[url]['manual_features'][featureObj['type']] = []
            trail_report[url]['manual_features'][featureObj['type']].append(
                featureObj['value'])

    # for each url get any highlighted text
    for url in trail_report.keys():
        selections = db.getSelections(domain, trail, url, org)
        if len(selections) > 0:
            trail_report[url]['selections'] = selections

    result = {'trail': trail, 'urls': trail_report}
    return json.dumps(result, sort_keys=True, indent=4, separators=(',', ':'))
Example #23
def get_all_entities(url, domain):
    entity_data_connector = None
    try:
        entity_data_connector = factory.get_entity_data_connector()
        all_entities = entity_data_connector.get_extracted_entities_from_urls(
            [url])
        domain_extracted = entity_data_connector.get_extracted_domain_entities_from_urls(
            domain, [url])
        entities = dict(domainExtracted=domain_extracted.get(url, {}),
                        allEntities=all_entities.get(url, {}))
        return json.dumps(entities)

    finally:
        if entity_data_connector is not None:
            entity_data_connector.close()
Example #24
def get_lookahead(url, srcurl, domain):
    entity_data_connector = None
    try:
        entity_data_connector = factory.get_entity_data_connector()
        # get the features from the lookahead url that are also on the src url
        matches = entity_data_connector.get_matching_entities_from_url(
            [url, srcurl])
        domain_matches = entity_data_connector.get_extracted_domain_entities_for_urls(
            domain, [url])
        result = dict(url=url,
                      matches=matches,
                      domain_search_matches=domain_matches)
        return json.dumps(result)
    finally:
        if entity_data_connector is not None:
            entity_data_connector.close()
Example #25
def get(domain, trail, stars, newdomain):
    org = helper.get_org().upper()

    if not db.domain_exists(newdomain):
        db.add_new_domain(newdomain,
                          'auto generated domain from trail: ' + trail)

    features = set([])
    url_set = set([])
    stars = int(stars)
    # get all starred urls for the trail

    for (url, rank) in db.getRankedUrls(org, trail, domain):
        url_set.add(url)

    if stars < 1:
        urls = db.getBrowsePathUrls(org, trail)
        for url in urls:
            url_set.add(url)

    # get the list of invalid entities for the domain
    markedEntities = set([])
    for (type, value) in db.get_marked_entities_for_domain(org, domain):
        markedEntities.add(value)

    # for each url get all extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(
        url_set)
    for url, featureDict in all_entities.iteritems():
        for type, values in featureDict.iteritems():
            type = type.replace(',', ' ')
            for value in values:
                if value not in markedEntities:
                    value = value.replace(',', ' ')
                    features.add(type + "\0" + value)

    # for each url get any manually extracted entities
    for url in url_set:
        for featureObj in db.get_feedback_entities(org, domain, url):
            type = featureObj['type'].replace(',', ' ')
            value = featureObj['value'].replace(',', ' ')
            features.add(type + "\0" + value)

    entity_data_connector.add_new_domain_items(
        map(lambda x: newdomain + '\0' + x, features))
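The items written above follow a NUL-delimited 'domain\0type\0value' row-key convention; a small sketch of packing and unpacking it (the domain, type, and value are invented):

# '\0'-delimited row key as built by the code above; sample values only.
rowkey = 'mydomain' + '\0' + 'email' + '\0' + 'bob@example.com'
domain, feature_type, feature_value = rowkey.split('\0', 2)
print domain, feature_type, feature_value  # mydomain email bob@example.com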
Example #26
def loadDomain(name, description, filename):
    print "\nloading domain: ", name, "\ndescription: ", description, "\nfrom path: ", filename, "\n"

    # if db.domain_exists(name):
    #    raise ValueError("Domain already exists: "+name)

    fobj = open(filename, "r")

    domain_content_connector = factory.get_entity_data_connector()
    try:
        cnt = 0
        items = []
        for row in fobj:
            row = row.strip()
            row = row.replace("\0", "")
            cnt = cnt + 1
            i = row.index(",")
            attr = row[:i]
            value = row[i + 1 :]
            if attr[0] == '"' and attr[len(type) - 1] == '"':
                attr = attr[1:-1]
            if value[0] == '"' and value[len(value) - 1] == '"':
                value = value[1:-1]
            items.append(name + "\0" + attr + "\0" + value)
            if cnt % 1000 == 0:
                domain_content_connector.add_new_domain_items(items)
                items = []
            if cnt % 1000 == 0:
                print "added: ", cnt
        if len(items) > 0:
            domain_content_connector.add_new_domain_items(items)

        print "added new items cnt=", cnt

        db.add_new_domain(name, description)
    finally:
        fobj.close()
        domain_content_connector.close()
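loadDomain expects one 'attribute,value' pair per line, optionally quoted; a made-up invocation for illustration (the path and metadata are hypothetical):

# Hypothetical input file (/tmp/acme_items.csv) for loadDomain:
#   email,bob@example.com
#   "phone","555-0100"
loadDomain('acme', 'ACME watchlist', '/tmp/acme_items.csv')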
Example #27
def upload_file(team_id, name, description, features):

    # tangelo.log("Loading new domain: ")
    # tangelo.log(name)
    # tangelo.log(description)
    # tangelo.log(features)

    domain_id = db.add_new_domain(team_id, name, description)

    domain_content_connector = factory.get_entity_data_connector()

    try:
        domain_content_connector.add_new_domain_items(domain_id, features)
    except:
        db.remove_domain(domain_id)
        raise
    finally:
        domain_content_connector.close()

    newdomain = dict(id=domain_id, name=name, description=description)
    # tangelo.log("loaded new domain")
    # tangelo.log(newdomain)
    return json.dumps(newdomain)
Example #28
def upload_file(file_upload, name, description):
    tangelo.log("Loading new domain: "+name)
    domain_content_connector = factory.get_entity_data_connector()
    try:
        if not db.domain_exists(name):
            if file_upload is not None:
                domain_file_lines = file_upload.file.readlines()
                domain_file_lines = map(lambda x: x.strip().replace('\0', ''), domain_file_lines)
                db.add_new_domain(name, description)
                rowkeys = []
                for line in domain_file_lines:
                    i = line.index(',')  # split on the first comma
                    type = line[:i]
                    value = line[i + 1:]
                    if type[0] == '"' and type[len(type) - 1] == '"': type = type[1:-1]
                    if value[0] == '"' and value[len(value) - 1] == '"': value = value[1:-1]
                    rowkeys.append("%s\0%s\0%s" % (name, type, value))
                result = domain_content_connector.add_new_domain_items(rowkeys)
                return json.dumps(dict(success=result))
            return json.dumps(dict(success=False))
        return json.dumps(dict(success=False))
    finally:
        domain_content_connector.close()
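The first-comma split above deliberately keeps any later commas inside the value; a quick illustration with an invented CSV line:

# Behavior of the first-comma split and quote stripping; the line is made up.
line = '"email","bob@example.com, work"'
i = line.index(',')  # split on the first comma only
type, value = line[:i], line[i + 1:]
if type[0] == '"' and type[-1] == '"':
    type = type[1:-1]
if value[0] == '"' and value[-1] == '"':
    value = value[1:-1]
print type   # email
print value  # bob@example.com, work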
Example #29
def upload_file(team_id, name, description, features):

    #tangelo.log("Loading new domain: ")
    #tangelo.log(name)
    #tangelo.log(description)
    #tangelo.log(features)

    domain_id = db.add_new_domain(team_id, name, description)

    domain_content_connector = factory.get_entity_data_connector()

    try:
        domain_content_connector.add_new_domain_items(domain_id, features)
    except:
        db.remove_domain(domain_id)
        raise
    finally:
        domain_content_connector.close()

    newdomain = dict(id=domain_id, name=name, description=description)
    #tangelo.log("loaded new domain")
    #tangelo.log(newdomain)
    return json.dumps(newdomain)
Example #30
def loadDomain(name, description, filename):
    print '\nloading domain: ', name, '\ndescription: ', description, '\nfrom path: ', filename, '\n'

    #if db.domain_exists(name):
    #    raise ValueError("Domain already exists: "+name)

    fobj = open(filename, 'r')

    domain_content_connector = factory.get_entity_data_connector()
    try:
        cnt = 0
        items = []
        for row in fobj:
            row = row.strip()
            row = row.replace("\0", '')
            cnt = cnt + 1
            i = row.index(',')
            attr = row[:i]
            value = row[i + 1:]
            if attr[0] == '"' and attr[len(type) - 1] == '"': attr = attr[1:-1]
            if value[0] == '"' and value[len(value) - 1] == '"':
                value = value[1:-1]
            items.append(name + '\0' + attr + '\0' + value)
            if cnt % 1000 == 0:
                domain_content_connector.add_new_domain_items(items)
                items = []
                print 'added: ', cnt
        if len(items) > 0:
            domain_content_connector.add_new_domain_items(items)

        print 'added new items cnt=', cnt

        db.add_new_domain(name, description)
    finally:
        fobj.close()
        domain_content_connector.close()
Example #31
"""
import igraph
import tangelo
import time
import datawake.util.dataconnector.factory as factory
import tldextract
from datawake.util.db import datawake_mysql
from urlparse import urlparse
"""

Provides all the functionality around building graphs for display on the datawake
forensic view.

"""

entityDataConnector = factory.get_entity_data_connector()


def getBrowsePathEdges(trail_id, startdate, enddate, userlist=[]):
    tangelo.log('getBrowsePathEdges(%s,%s,%s,%s)' %
                (trail_id, startdate, enddate, userlist))

    rows = datawake_mysql.getVisitedUrlsInTrailForTimeRange(
        trail_id, startdate, enddate, userlist)

    edges = []
    nodes = {}
    edge_buffer = []
    for row in rows:
        tangelo.log(row)
        ts = row['ts']
Example #32
def get(team_id, domain_id, trail_id, stars, extracted_features):

    """

    :param team_id:  id of the team
    :param domain_id: id of domain
    :param trail_id: id of the trail
    :param stars: minimum avg star ranking to allow in report
    :param extracted_features: y or n (include auto extracted features in report)
    :return:
        {
            trail: {id:..,name:...,description:.., url-count}
            urls: [
                {
                    url: ..,
                    url-visit-count: ..
                    rank: {min: ?, max: ?, avg: ?, count: ?}, # rounded to nearest tenth
                    auto_features: { type: [value,..],..},
                manual_features: { type: [value,..],..},
                    selections: {ts: "2015-02-25 16:24:41", selection: "selected text"}
                },..
            ]
        }

    """


    trailData = db.getTrailData(trail_id)
    trail_report = {}


    # get ranked urls
    for rankObject in db.getRankedUrls(trail_id):
        if rankObject['avg'] >= stars:
            url = rankObject['url']
            del rankObject['url']
            trail_report[url] = {'url': url, 'rank': rankObject}

    # get url hit counts and unranked urls
    for urlObj in db.getBrowsePathUrls(trail_id):
        url = urlObj['url']
        if url in trail_report or stars < 1:
            if url not in trail_report:
                trail_report[url] = {'url': url, 'rank': {'min': 0, 'max': 0, 'avg': 0, 'count': 0}}
            trail_report[url]['url-visit-count'] = urlObj['count']

    trailData['url-count'] = len(trail_report.keys())

    # get the list of invalid entities for the domain
    if extracted_features != 'n':
        markedEntities = set([])
        for featureObj in db.get_marked_features(trail_id):
            key = featureObj['type'] + ':' + featureObj['value']
            markedEntities.add(key)

        # for each url get all extracted entities
        entity_data_connector = factory.get_entity_data_connector()
        all_entities = entity_data_connector.get_extracted_entities_from_urls(trail_report.keys())
        for url, featureDict in all_entities.iteritems():
            for type, values in featureDict.iteritems():
                filtered_values = []
                for value in values:
                    key = type + ':' + value
                    if key not in markedEntities:
                        filtered_values.append(value)
                if len(filtered_values) > 0:
                    try:
                        if 'auto_features' not in trail_report[url]:
                            trail_report[url]['auto_features'] = {}
                        trail_report[url]['auto_features'][type] = filtered_values
                    except Exception:
                        tangelo.log("report generation error. skipping url.")
                        continue

    # for each url get any manually extracted entities
    for url in trail_report.keys():
        for featureObj in db.get_manual_features(trail_id, url):
            if 'manual_features' not in trail_report[url]:
                trail_report[url]['manual_features'] = {}
            if featureObj['type'] not in trail_report[url]['manual_features']:
                trail_report[url]['manual_features'][featureObj['type']] = []
            trail_report[url]['manual_features'][featureObj['type']].append(featureObj['value'])

    # for each url get any highlighted text
    for url in trail_report.keys():
        selections = db.getSelections(trail_id, url)

        # let's keep user names out of reports for now
        if len(selections) > 0:
            for sel in selections:
                del sel['userEmail']
            trail_report[url]['selections'] = selections

    result = {'trail': trailData, 'urls': trail_report.values()}
    return json.dumps(result, sort_keys=True, indent=4, separators=(',', ':'))
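A hedged sketch of reading the report JSON whose layout the docstring above describes; the ids are invented and the field access follows the docstring:

# Hypothetical report consumer; ids are made up.
import json

report = json.loads(get(1, 7, 42, 0, 'y'))
print report['trail']['url-count']
for page in report['urls']:
    print page['url'], page.get('url-visit-count', 0), page['rank']['avg']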
Example #33
File: helper.py  Project: diffeo/Datawake
"""
import igraph
import tangelo

import datawake.util.dataconnector.factory as factory
from datawake.util.db import datawake_mysql


"""

Provides all the functionality around building graphs for display on the datawake
forensic view.

"""

entityDataConnector = factory.get_entity_data_connector()


def getBrowsePathEdges(org, startdate, enddate, userlist=[], trail='*', domain=''):
    print 'getBrowsePathEdges(', startdate, ',', enddate, ',', userlist, ')'
    org = org.upper()
    command = """SELECT unix_timestamp(t1.ts) as ts, t1.url,hits,userName,userId,id,trail
                 FROM datawake_data as t1 LEFT JOIN (select url,count(url) as hits from datawake_data WHERE org = %s and domain = %s group by url ) as t2 ON t1.url = t2.url
                 WHERE t1.org = %s and t1.domain = %s
              """
    commandArgs = [org, domain, org, domain]

    # add the time filter to the query
    if (startdate == '' and enddate == ''):
        pass
    elif (startdate != '' and enddate == ''):