示例#1
0
def invalid_extraction(entity_type, entity_value, domain):
    """Report an extracted entity as invalid via the data layer.

    The user name and org are read from ``session_helper`` and handed,
    together with the entity description, to
    ``db.mark_invalid_extracted_entity``.

    Returns:
        A JSON string with a single ``success`` flag, which is true only
        when the database call returned 0.
    """
    user_name = session_helper.get_user().get_user_name()
    org = session_helper.get_org()
    status = db.mark_invalid_extracted_entity(
        user_name, entity_type, entity_value, domain, org)
    return json.dumps({'success': status == 0})
示例#2
0
def add_irrelevant_trail_entity(domain, trail, entity):
    """Store ``entity`` as irrelevant for a trail unless it already exists.

    When ``db.does_irrelevant_entity_exist`` reports the entity, nothing is
    written and success is returned.  Otherwise the UTF-8 encoded entity is
    added and, if the add returned 0, a trail-term message is sent through
    ``kafka_producer``.

    Returns:
        A JSON string with a single ``success`` flag.
    """
    org = helper.get_org()
    if db.does_irrelevant_entity_exist(org, domain, trail, entity):
        return json.dumps(dict(success=True))

    encoded_entity = entity.encode("utf-8")
    added = db.add_irrelevant_trail_entity(org, domain, trail, encoded_entity) == 0
    if added:
        # the message carries the original (unencoded) entity
        kafka_producer.send_trail_term_message(org, domain, trail, entity, False)
    return json.dumps(dict(success=added))
def add_irrelevant_trail_entity(domain, trail, entity):
    """Add ``entity`` to a trail's irrelevant entities if it is not there yet.

    Returns:
        A JSON string ``{"success": <bool>}``.  The flag is true when the
        entity already existed, or when the add call returned 0.
    """
    org = helper.get_org()
    already_present = db.does_irrelevant_entity_exist(org, domain, trail, entity)
    if already_present:
        outcome = True
    else:
        outcome = db.add_irrelevant_trail_entity(
            org, domain, trail, entity.encode("utf-8")) == 0
        if outcome:
            # a message is sent only after a successful insert
            kafka_producer.send_trail_term_message(
                org, domain, trail, entity, False)
    return json.dumps(dict(success=outcome))
def get_trail_entity_links(domain, trail):
    """Return the visited and not-visited entity links of a trail as JSON.

    Returns:
        A JSON string with the keys ``visited`` and ``notVisited``, each
        holding whatever the corresponding ``db`` query returned.
    """
    org = helper.get_org()
    visited_links = db.get_visited_trail_entity_links(org, domain, trail)
    unvisited_links = db.get_not_visited_trail_entity_links(org, domain, trail)
    return json.dumps(dict(visited=visited_links, notVisited=unvisited_links))
示例#5
0
def getTimeWindow(users, trail=u'*'):
    org = helper.get_org()
    if trail == u'':
        trail = u'*'
    print 'getTimeWindow(', users, ',', trail, ')'
    if len(users) > 0:
        users = users.split(",")
    else:
        users = []
    return json.dumps(datawake_mysql.getTimeWindow(org, users, trail))
示例#6
0
def getTimeWindow(users, trail=u'*'):
    org = helper.get_org()
    if trail == u'':
        trail = u'*'
    print 'getTimeWindow(', users, ',', trail, ')'
    if len(users) > 0:
        users = users.split(",")
    else:
        users = []
    return json.dumps(datawake_mysql.getTimeWindow(org, users, trail))
示例#7
0
def get(domain,trail,stars,newdomain):
    """Populate ``newdomain`` with the entities found on a trail's URLs.

    The URL set always contains the trail's ranked URLs; when ``stars`` is
    below 1 the trail's browse-path URLs are added as well.  For every URL
    the automatically extracted entities (minus values marked for
    ``domain``) and the feedback entities are collected as
    ``type + NUL + value`` strings, with commas in type and value replaced
    by spaces.  Each feature is then prefixed with ``newdomain + NUL`` and
    handed to the entity data connector.

    Args:
        domain: source domain used for ranked URLs and marked entities.
        trail: trail whose URLs are harvested.
        stars: value convertible with ``int``; see above for its effect.
        newdomain: domain to create (if missing) and fill.

    Returns:
        None.

    Raises:
        ValueError: if ``stars`` cannot be converted to an integer.  The
            conversion happens before any database write so that bad input
            no longer leaves a freshly created, empty domain behind.
    """
    org = helper.get_org().upper()

    # validate input before touching the database
    stars = int(stars)

    if not db.domain_exists(newdomain):
        db.add_new_domain(newdomain,'auto generated domain from trail: '+trail)

    # ranked urls are always part of the set
    url_set = set(url for (url, _rank) in db.getRankedUrls(org,trail,domain))
    if stars < 1:
        url_set.update(db.getBrowsePathUrls(org,trail))

    # marked entities are matched on value only; their type is ignored
    marked_values = set(
        value for (_marked_type, value)
        in db.get_marked_entities_for_domain(org, domain))

    features = set()

    # automatically extracted entities for every url
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(url_set)
    for _url, feature_dict in all_entities.iteritems():
        for entity_type, values in feature_dict.iteritems():
            entity_type = entity_type.replace(',',' ')
            for value in values:
                # the marked-value check uses the raw value, before commas
                # are replaced
                if value not in marked_values:
                    features.add(entity_type+"\0"+value.replace(',',' '))

    # manually extracted (feedback) entities for every url
    for url in url_set:
        for feature in db.get_feedback_entities(org, domain, url):
            entity_type = feature['type'].replace(',',' ')
            value = feature['value'].replace(',',' ')
            features.add(entity_type+"\0"+value)

    entity_data_connector.add_new_domain_items(
        [newdomain+'\0'+feature for feature in features])
示例#8
0
def get(domain, trail):
    """Build a JSON report for the ranked URLs of a trail.

    Every URL returned by ``db.getRankedUrls`` gets an entry holding its
    ``rank`` and, when available, ``auto_features`` (extracted entities by
    type, without values marked for the domain), ``manual_features``
    (feedback entities by type) and ``selections``.

    Args:
        domain: domain name passed through to the data layer.
        trail: trail name; echoed in the result under ``trail``.

    Returns:
        An indented JSON string with sorted keys of the form
        ``{"trail": trail, "urls": {url: {...}}}``.
    """
    org = helper.get_org()

    # one report entry per ranked url
    trail_report = {}
    for (url, rank) in db.getRankedUrls(org, trail, domain):
        trail_report[url] = {'rank': rank}

    # marked entities are matched on value only; their type is ignored
    marked_values = set(
        value for (_marked_type, value)
        in db.get_marked_entities_for_domain(org, domain))

    # automatically extracted entities for every url
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(
        trail_report.keys())
    for url, feature_dict in all_entities.iteritems():
        for entity_type, values in feature_dict.iteritems():
            kept_values = [v for v in values if v not in marked_values]
            if not kept_values:
                continue
            try:
                url_report = trail_report[url]
            except KeyError:
                # The connector returned a url that is not in the report.
                # Only this lookup failure is tolerated; any other error
                # now propagates instead of being silently swallowed.
                tangelo.log("report generation error. skipping url.")
                continue
            url_report.setdefault('auto_features', {})[entity_type] = kept_values

    # manually extracted (feedback) entities for every url
    for url, url_report in trail_report.items():
        for feature in db.get_feedback_entities(org, domain, url):
            manual_features = url_report.setdefault('manual_features', {})
            manual_features.setdefault(feature['type'], []).append(
                feature['value'])

    # highlighted text for every url; empty results are left out
    for url, url_report in trail_report.items():
        selections = db.getSelections(domain, trail, url, org)
        if len(selections) > 0:
            url_report['selections'] = selections

    result = {'trail': trail, 'urls': trail_report}
    return json.dumps(result, sort_keys=True, indent=4, separators=(',', ':'))
def get_chart(users=u"", trail=u"*", domain=u""):
    """Return hourly browse-path counts as JSON chart data.

    Args:
        users: comma-separated user names; empty selects no users.
        trail: trail name; an empty string is treated as ``u"*"``.
        domain: forwarded to the query as the ``domain`` keyword.

    Returns:
        A JSON string with the query result under the ``data`` key.
    """
    org = helper.get_org()
    trail = u"*" if trail == u"" else trail
    user_list = users.split(",") if len(users) > 0 else []
    counts = datawake_mysql.getHourlyBrowsePathCounts(
        org, user_list, trail, domain=domain)
    return json.dumps({'data': counts})
示例#10
0
def get_chart(users=u'', trail=u'*', domain=u''):
    """Fetch hourly browse-path counts for the chart view.

    ``users`` is one comma-separated string that is split into a list for
    the query; an empty ``trail`` falls back to ``u'*'``.

    Returns:
        A JSON string of the form ``{"data": <query result>}``.
    """
    org = helper.get_org()
    if trail == u'':
        trail = u'*'
    selected_users = []
    if len(users) > 0:
        selected_users = users.split(",")
    hourly_counts = datawake_mysql.getHourlyBrowsePathCounts(
        org, selected_users, trail, domain=domain)
    return json.dumps({'data': hourly_counts})
示例#11
0
def get(domain, trail, stars, newdomain):
    """Create ``newdomain`` (if needed) and fill it from a trail's entities.

    Ranked URLs of the trail are always used; browse-path URLs are added
    when ``stars`` is below 1.  Extracted entities whose value is marked
    for ``domain`` are skipped; feedback entities are always included.
    Features are stored as ``newdomain + NUL + type + NUL + value`` with
    commas in type and value replaced by spaces.

    Args:
        domain: source domain used for ranked URLs and marked entities.
        trail: trail whose URLs are harvested.
        stars: value convertible with ``int``.
        newdomain: domain that receives the collected features.

    Returns:
        None.

    Raises:
        ValueError: if ``stars`` is not convertible to an integer.  This is
            checked before the domain is created so invalid input does not
            leave an empty domain behind.
    """
    org = helper.get_org().upper()

    # fail on bad input before any database write
    stars = int(stars)

    if not db.domain_exists(newdomain):
        db.add_new_domain(newdomain,
                          'auto generated domain from trail: ' + trail)

    # collect the urls to harvest
    url_set = set()
    for (url, _rank) in db.getRankedUrls(org, trail, domain):
        url_set.add(url)
    if stars < 1:
        url_set.update(db.getBrowsePathUrls(org, trail))

    # values of entities marked for the domain (type is not considered)
    marked_values = set()
    for (_marked_type, marked_value) in db.get_marked_entities_for_domain(
            org, domain):
        marked_values.add(marked_value)

    features = set()

    # automatically extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(
        url_set)
    for _url, feature_dict in all_entities.iteritems():
        for entity_type, values in feature_dict.iteritems():
            clean_type = entity_type.replace(',', ' ')
            for value in values:
                if value in marked_values:
                    continue
                features.add(clean_type + "\0" + value.replace(',', ' '))

    # manually extracted (feedback) entities
    for url in url_set:
        for feature in db.get_feedback_entities(org, domain, url):
            clean_type = feature['type'].replace(',', ' ')
            clean_value = feature['value'].replace(',', ' ')
            features.add(clean_type + "\0" + clean_value)

    entity_data_connector.add_new_domain_items(
        [newdomain + '\0' + feature for feature in features])
示例#12
0
def getGraph(name, startdate=u'', enddate=u'', users=u'', trail=u'*', domain=u''):
    """Return the graph called ``name`` as a JSON string.

    Args:
        name: graph label; unknown labels produce an empty graph.
        startdate, enddate: forwarded unchanged to ``graph_helper``.
        users: comma-separated user names; double quotes and surrounding
            whitespace are stripped and empty entries dropped.
        trail: trail name; an empty string is treated as ``u'*'``.
        domain: forwarded unchanged to ``graph_helper``.

    Returns:
        JSON of ``graph_helper.processEdges`` for the browse-path graphs,
        JSON of the raw rows for ``OculusForensicRequest``, and
        ``{"nodes": [], "links": []}`` for any other name.
    """
    org = helper.get_org()
    trail = u'*' if trail == u'' else trail
    userlist = filter(
        None, map(lambda raw: raw.replace('\"', '').strip(), users.split(',')))

    # (graph label, graph_helper function name, extra args after enddate)
    edge_sources = (
        ('browse path', 'getBrowsePathEdges', ()),
        ('browse path - with adjacent urls',
         'getBrowsePathAndAdjacentWebsiteEdgesWithLimit', (1,)),
        ('browse path - with adjacent urls min degree 2',
         'getBrowsePathAndAdjacentWebsiteEdgesWithLimit', (2,)),
        ('browse path - with adjacent phone #\'s',
         'getBrowsePathAndAdjacentPhoneEdgesWithLimit', (1,)),
        ('browse path - with adjacent email #\'s',
         'getBrowsePathAndAdjacentEmailEdgesWithLimit', (1,)),
        ('browse path - with text selections',
         'getBrowsePathWithTextSelections', ()),
        ('browse path- with look ahead', 'getBrowsePathWithLookAhead', ()),
        ('browse path - with adjacent info',
         'getBrowsePathAndAdjacentInfoEdges', (1,)),
    )
    for label, builder_name, extra_args in edge_sources:
        if name == label:
            build_edges = getattr(graph_helper, builder_name)
            call_args = (org, startdate, enddate) + extra_args + (userlist, trail, domain)
            graph = build_edges(*call_args)
            return json.dumps(graph_helper.processEdges(graph['edges'], graph['nodes']))

    # the forensic graph is returned as-is, without edge processing
    if name == 'OculusForensicRequest':
        return json.dumps(
            graph_helper.getOculusForensicGraph(org, startdate, enddate, userlist, trail, domain))

    return json.dumps(dict(nodes=[], links=[]))
示例#13
0
def get_trails(domain):
    """Return the trails of ``domain`` for the org given by ``helper.get_org()``.

    The work is delegated to ``get_trails_for_domain_and_org`` and its
    result is returned unchanged.
    """
    return get_trails_for_domain_and_org(helper.get_org(), domain)
示例#14
0
def get_trail_based_entities(domain, trail):
    """Return a trail's entities and its irrelevant entities as JSON.

    The org is looked up once and reused for both queries, matching the
    other handlers, instead of calling ``helper.get_org()`` twice.

    Returns:
        A JSON string with the keys ``entities`` and
        ``irrelevantEntities``, each holding the result of the
        corresponding ``db`` query.
    """
    org = helper.get_org()
    entities = db.get_trail_based_entities(org, domain, trail)
    irrelevant_entities = db.get_irrelevant_trail_based_entities(org, domain, trail)
    return json.dumps(dict(entities=entities, irrelevantEntities=irrelevant_entities))
示例#15
0
def get_trails(domain):
    """Look up the trails for ``domain`` within the current org.

    Returns:
        Whatever ``get_trails_for_domain_and_org`` returns for the org
        reported by ``helper.get_org()`` and the given domain.
    """
    current_org = helper.get_org()
    trails = get_trails_for_domain_and_org(current_org, domain)
    return trails
示例#16
0
def fetch_entities(domain, url):
    """Return the feedback entities recorded for ``url`` as JSON.

    Returns:
        A JSON string with the result of ``db.get_feedback_entities``
        under the ``entities`` key.
    """
    feedback = db.get_feedback_entities(session_helper.get_org(), domain, url)
    return json.dumps({'entities': feedback})
示例#17
0
def marked_entities(domain):
    """Return the entities marked by the session user in ``domain``.

    Returns:
        A JSON string with the result of ``db.get_marked_entities`` under
        the ``marked_entities`` key.
    """
    user_name = session_helper.get_user().get_user_name()
    org = session_helper.get_org()
    return json.dumps(
        {'marked_entities': db.get_marked_entities(org, domain, user_name)})
示例#18
0
def get_url_entities(domain, trail, url):
    """Return the entities found on ``url`` for a trail as JSON.

    The serialised response is also written to the tangelo info log.

    Returns:
        A JSON string with the result of ``db.get_entities_on_url`` under
        the ``entities`` key.
    """
    org = helper.get_org()
    found = db.get_entities_on_url(org, domain, trail, url)
    results = json.dumps({'entities': found})
    tangelo.log_info(results)
    return results
示例#19
0
def good_extraction(raw_text, entity_type, entity_value, url, domain):
    """Record extractor feedback for an entity found on ``url``.

    Returns:
        A JSON string with a single ``success`` flag, which is true only
        when ``db.add_extractor_feedback`` returned 0.
    """
    org = session_helper.get_org()
    status = db.add_extractor_feedback(
        org, domain, raw_text, entity_type, entity_value, url)
    return json.dumps({'success': status == 0})
示例#20
0
def deleteUser(users, startdate, enddate):
    """Forward a user-data deletion request to ``datawake_mysql``.

    The call is logged through ``tangelo.log`` first.  The result of
    ``deleteUserData`` is not inspected.

    Returns:
        A JSON string whose ``success`` flag is always true.
    """
    org = helper.get_org()
    call_description = 'deleteUser(' + users + ',' + startdate + ',' + enddate + ')'
    tangelo.log(call_description)
    datawake_mysql.deleteUserData(org, users, startdate, enddate)
    return json.dumps({'success': True})
示例#21
0
def marked_entities(domain):
    """List the marked entities for the session user's org and ``domain``.

    User name and org are read from ``session_helper``.

    Returns:
        A JSON string of the form ``{"marked_entities": <db result>}``.
    """
    session_user = session_helper.get_user()
    name = session_user.get_user_name()
    org = session_helper.get_org()
    payload = {'marked_entities': db.get_marked_entities(org, domain, name)}
    return json.dumps(payload)
示例#22
0
def getTrails():
    """Return the org's trails with user counts as a JSON array.

    An empty object is inserted at the front of the list returned by
    ``datawake_mysql.getTrailsWithUserCounts`` before it is serialised.
    """
    trails = datawake_mysql.getTrailsWithUserCounts(helper.get_org())
    trails.insert(0, {})
    return json.dumps(trails)
示例#23
0
def listUsers():
    """Return the active users of the current org as a JSON string.

    The list comes from ``datawake_mysql.getActiveUsers`` and is
    serialised unchanged.
    """
    active_users = datawake_mysql.getActiveUsers(helper.get_org())
    return json.dumps(active_users)
示例#24
0
def delete_link_from_trail(domain, trail, url):
    """Remove ``url`` from a trail through the data layer.

    Returns:
        A JSON string with a single ``success`` flag, which is true only
        when ``db.delete_link_from_trail`` returned 0.
    """
    status = db.delete_link_from_trail(helper.get_org(), domain, trail, url)
    return json.dumps({'success': status == 0})
示例#25
0
def get_trail_entity_links(domain, trail):
    """Collect a trail's entity links, split by visited state, as JSON.

    Returns:
        A JSON string with ``visited`` and ``notVisited`` entries taken
        from the two matching ``db`` queries.
    """
    org = helper.get_org()
    visited = db.get_visited_trail_entity_links(org, domain, trail)
    not_visited = db.get_not_visited_trail_entity_links(org, domain, trail)
    payload = dict(visited=visited, notVisited=not_visited)
    return json.dumps(payload)
示例#26
0
def get_trail_based_entities(domain, trail):
    """Fetch a trail's entities together with its irrelevant entities.

    ``helper.get_org()`` is called once and the result shared by both
    queries, as the sibling handlers do, rather than once per query.

    Returns:
        A JSON string of the form
        ``{"entities": ..., "irrelevantEntities": ...}`` built from the two
        ``db`` query results.
    """
    org = helper.get_org()
    payload = dict(
        entities=db.get_trail_based_entities(org, domain, trail),
        irrelevantEntities=db.get_irrelevant_trail_based_entities(org, domain, trail),
    )
    return json.dumps(payload)
示例#27
0
def get_url_entities(domain, trail, url):
    """Serialise the entities stored for ``url`` on a trail.

    Returns:
        The JSON string ``{"entities": <db result>}``; the same string is
        passed to ``tangelo.log_info`` before returning.
    """
    payload = dict(
        entities=db.get_entities_on_url(helper.get_org(), domain, trail, url))
    serialised = json.dumps(payload)
    tangelo.log_info(serialised)
    return serialised
示例#28
0
def invalid_extraction(entity_type, entity_value, domain):
    """Flag an extracted entity as invalid for the session user.

    Returns:
        A JSON string ``{"success": <bool>}``; the flag reflects whether
        ``db.mark_invalid_extracted_entity`` returned 0.
    """
    session_user = session_helper.get_user()
    name = session_user.get_user_name()
    org = session_helper.get_org()
    marked = 0 == db.mark_invalid_extracted_entity(
        name, entity_type, entity_value, domain, org)
    return json.dumps({'success': marked})
示例#29
0
def get_selections(domain, trail, url):
    """Return the selections stored for ``url`` on a trail as JSON.

    Returns:
        A JSON string with the result of ``db.getSelections`` under the
        ``selections`` key.
    """
    org = helper.get_org()
    # note the argument order: the org is passed last
    selections = db.getSelections(domain, trail, url, org)
    return json.dumps({'selections': selections})
示例#30
0
def fetch_entities(domain, url):
    """Look up the feedback entities for ``url`` in ``domain``.

    Returns:
        A JSON string of the form ``{"entities": <db result>}``.
    """
    session_org = session_helper.get_org()
    payload = {'entities': db.get_feedback_entities(session_org, domain, url)}
    return json.dumps(payload)
示例#31
0
def good_extraction(raw_text, entity_type, entity_value, url, domain):
    """Store extractor feedback for an entity seen on ``url``.

    Returns:
        A JSON string ``{"success": <bool>}``; the flag is true when
        ``db.add_extractor_feedback`` returned 0.
    """
    stored = 0 == db.add_extractor_feedback(
        session_helper.get_org(), domain, raw_text, entity_type,
        entity_value, url)
    return json.dumps({'success': stored})
示例#32
0
def listUsers():
    """List the active users for the org given by ``helper.get_org()``.

    Returns:
        The JSON-encoded result of ``datawake_mysql.getActiveUsers``.
    """
    current_org = helper.get_org()
    users = datawake_mysql.getActiveUsers(current_org)
    return json.dumps(users)
示例#33
0
def get_selections(domain, trail, url):
    """Fetch the stored selections for a URL of a trail.

    Returns:
        A JSON string of the form ``{"selections": <db result>}``.
    """
    # db.getSelections takes the org as its last argument
    found = db.getSelections(domain, trail, url, helper.get_org())
    payload = {'selections': found}
    return json.dumps(payload)
示例#34
0
def getGraph(name,
             startdate=u'',
             enddate=u'',
             users=u'',
             trail=u'*',
             domain=u''):
    """Build the graph selected by ``name`` and return it as JSON.

    Args:
        name: graph label; see the branches below for the known labels.
        startdate, enddate: passed through to ``graph_helper``.
        users: comma-separated user names; double quotes and surrounding
            whitespace are removed and empty entries discarded.
        trail: trail name; an empty string falls back to ``u'*'``.
        domain: passed through to ``graph_helper``.

    Returns:
        For browse-path graphs, the JSON of ``graph_helper.processEdges``
        applied to the graph's edges and nodes; for
        ``OculusForensicRequest``, the JSON of the returned rows; for any
        other name, ``{"nodes": [], "links": []}``.
    """
    org = helper.get_org()
    trail = u'*' if trail == u'' else trail
    stripped_names = map(lambda raw: raw.replace('\"', '').strip(),
                         users.split(','))
    userlist = filter(lambda candidate: len(candidate) > 0, stripped_names)

    if name == 'browse path':
        graph = graph_helper.getBrowsePathEdges(
            org, startdate, enddate, userlist, trail, domain)
    elif name == 'browse path - with adjacent urls':
        graph = graph_helper.getBrowsePathAndAdjacentWebsiteEdgesWithLimit(
            org, startdate, enddate, 1, userlist, trail, domain)
    elif name == 'browse path - with adjacent urls min degree 2':
        graph = graph_helper.getBrowsePathAndAdjacentWebsiteEdgesWithLimit(
            org, startdate, enddate, 2, userlist, trail, domain)
    elif name == 'browse path - with adjacent phone #\'s':
        graph = graph_helper.getBrowsePathAndAdjacentPhoneEdgesWithLimit(
            org, startdate, enddate, 1, userlist, trail, domain)
    elif name == 'browse path - with adjacent email #\'s':
        graph = graph_helper.getBrowsePathAndAdjacentEmailEdgesWithLimit(
            org, startdate, enddate, 1, userlist, trail, domain)
    elif name == 'browse path - with text selections':
        graph = graph_helper.getBrowsePathWithTextSelections(
            org, startdate, enddate, userlist, trail, domain)
    elif name == 'browse path- with look ahead':
        graph = graph_helper.getBrowsePathWithLookAhead(
            org, startdate, enddate, userlist, trail, domain)
    elif name == 'browse path - with adjacent info':
        graph = graph_helper.getBrowsePathAndAdjacentInfoEdges(
            org, startdate, enddate, 1, userlist, trail, domain)
    elif name == 'OculusForensicRequest':
        # forensic rows are serialised directly, without edge processing
        return json.dumps(
            graph_helper.getOculusForensicGraph(org, startdate, enddate,
                                                userlist, trail, domain))
    else:
        # unknown graph name: empty graph
        return json.dumps(dict(nodes=[], links=[]))

    # every browse-path variant is post-processed the same way
    return json.dumps(
        graph_helper.processEdges(graph['edges'], graph['nodes']))
示例#35
0
def getTrails():
    """List the trails with user counts for the current org.

    Returns:
        A JSON array: the list from
        ``datawake_mysql.getTrailsWithUserCounts`` with an empty object
        inserted as its first element.
    """
    current_org = helper.get_org()
    trail_rows = datawake_mysql.getTrailsWithUserCounts(current_org)
    trail_rows.insert(0, {})
    return json.dumps(trail_rows)
示例#36
0
def delete_link_from_trail(domain, trail, url):
    """Ask the data layer to delete ``url`` from a trail.

    Returns:
        A JSON string ``{"success": <bool>}``; the flag is true when
        ``db.delete_link_from_trail`` returned 0.
    """
    current_org = helper.get_org()
    deleted = 0 == db.delete_link_from_trail(current_org, domain, trail, url)
    return json.dumps({'success': deleted})