Example #1
def get(domain, trail):
    org = helper.get_org()

    trail_report = {}

    # get all starred urls for the trail
    for (url, rank) in db.getRankedUrls(org, trail, domain):
        trail_report[url] = {
            'rank': rank,
        }

    # get the list of invalid entities for the domain
    markedEntities = set([])
    for (type, value) in db.get_marked_entities_for_domain(org, domain):
        markedEntities.add(value)

    # for each url get all extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(
        trail_report.keys())
    for url, featureDict in all_entities.iteritems():
        for type, values in featureDict.iteritems():
            filtered_values = []
            for value in values:
                if value not in markedEntities:
                    filtered_values.append(value)
            if len(filtered_values) > 0:
                try:
                    if 'auto_features' not in trail_report[url]:
                        trail_report[url]['auto_features'] = {}
                    trail_report[url]['auto_features'][type] = filtered_values
                except Exception:
                    tangelo.log("report generation error. skipping url.")
                    continue

    # for each url get any manually extracted entities
    for url in trail_report.keys():
        for featureObj in db.get_feedback_entities(org, domain, url):
            if 'manual_features' not in trail_report[url]:
                trail_report[url]['manual_features'] = {}
            if featureObj['type'] not in trail_report[url]['manual_features']:
                trail_report[url]['manual_features'][featureObj['type']] = []
            trail_report[url]['manual_features'][featureObj['type']].append(
                featureObj['value'])

    # for each url get any highlighted text
    for url in trail_report.keys():
        selections = db.getSelections(domain, trail, url, org)
        if len(selections) > 0:
            trail_report[url]['selections'] = selections

    result = {'trail': trail, 'urls': trail_report}
    return json.dumps(result, sort_keys=True, indent=4, separators=(',', ':'))
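
The handler serializes the report with json.dumps, so a caller gets back a plain JSON string. Below is a minimal usage sketch, assuming the function is importable as get and that 'example.org' and 'my-trail' are placeholder identifiers for an existing domain and trail (they are not names taken from the project):

import json

# Hypothetical call; both arguments are placeholders.
report = json.loads(get('example.org', 'my-trail'))
for url, details in report['urls'].items():
    # 'rank' is always present; feature and selection keys appear only when data was found
    print(url, details['rank'])
    for feature_type, values in details.get('auto_features', {}).items():
        print('   ', feature_type, values)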
Example #2
def getBrowsePathWithTextSelections(trail_id, startdate, enddate, userlist=[]):
    # first get the browse path
    graph = getBrowsePathEdges(trail_id, startdate, enddate, userlist)
    nodes = graph['nodes']
    edges = graph['edges']

    newnodes = {}
    try:
        # for each node in the browse path pull any related notes:
        for key, node in nodes.iteritems():
            selections = datawake_mysql.getSelections(trail_id, key)
            for selection in selections:
                ts = selection['ts']
                user = selection['userEmail']
                text = selection['selection']
                id = 'selection_' + str(user) + '_' + str(ts)
                node = {
                    'id': id,
                    'type': 'selection',
                    'size': 5,
                    'groupName': user,
                    'timestamps': [ts],
                    'userNames': [user],
                    'data': text
                }
                newnodes[id] = node
                edges.append((key, id))

        nodes.update(newnodes)

        #if len(userlist) == 1:
        #    nodes = addUrlRankstoNodes(org,nodes,userlist[0],trail,domain=domain)

        return {'nodes': nodes, 'edges': edges}
    except:
        raise
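
The result is a plain dict with a 'nodes' map and an 'edges' list, where every text selection becomes its own node wired to the page it was made on. A short hedged sketch of walking that structure, assuming the function is importable, the argument values below are placeholders, and the edges returned by getBrowsePathEdges are (source, target) pairs like the selection edges appended above:

# Hypothetical call; trail id and date bounds are placeholders.
graph = getBrowsePathWithTextSelections(trail_id=1, startdate='2015-01-01', enddate='2015-12-31')

for source, target in graph['edges']:
    node = graph['nodes'].get(target)
    if node is not None and node.get('type') == 'selection':
        # 'data' carries the highlighted text captured on the source page
        print(source, '->', node['data'])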
Example #3
def get_selections(domain, trail, url):
    org = helper.get_org()
    return json.dumps(dict(selections=db.getSelections(domain, trail, url, org)))
Example #4
def get_selections(team_id, domain_id, trail_id, url):
    selections = db.getSelections(trail_id, url)
    return json.dumps(selections)
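
Both wrappers return the stored selections as a JSON string; the older variant scopes the lookup by organization, while this one goes straight by trail id. A minimal decoding sketch, assuming placeholder ids and that each selection carries the 'ts' and 'selection' keys seen in the other examples:

import json

# Hypothetical call; the ids and url are placeholders for real records.
selections = json.loads(get_selections(team_id=1, domain_id=1, trail_id=1, url='http://example.org/page'))
for sel in selections:
    print(sel['ts'], sel['selection'])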
Example #5
def get(team_id, domain_id, trail_id, stars, extracted_features):

    """

    :param team_id:  id of the team
    :param domain_id: id of domain
    :param trail_id: id of the trail
    :param stars: minimum avg star ranking to allow in report
    :param extracted_features: y or n (include auto extracted features in report)
    :return:
        {
            trail: {id:..,name:...,description:.., url-count}
            urls: [
                {
                    url: ..,
                    url-visit-count: ..
                    rank: {min: ?, max: ?, avg: ?, count: ?}, # rounded to nearest tenth
                    auto_features: { type: [value,..],..},
                    manual_features: { type: [value,..],..},
                    selections: {ts: "2015-02-25 16:24:41", selection: "selected text"}
                },..
            ]
        }

    """


    trailData = db.getTrailData(trail_id)
    trail_report = {}


    # get ranked urls
    for rankObject in db.getRankedUrls(trail_id):
        if rankObject['avg'] >= stars:
            url = rankObject['url']
            del rankObject['url']
            trail_report[url] = {'url': url, 'rank': rankObject}

    # get url hit counts and unranked urls
    for urlObj in db.getBrowsePathUrls(trail_id):
        url = urlObj['url']
        if url in trail_report or stars < 1:
            if url not in trail_report:
                trail_report[url] = {'url': url, 'rank': {'min': 0, 'max': 0, 'avg': 0, 'count': 0}}
            trail_report[url]['url-visit-count'] = urlObj['count']



    trailData['url-count'] = len(trail_report.keys())

    # get the list of invalid entities for the domain
    if extracted_features != 'n':
        markedEntities = set([])
        for featureObj in db.get_marked_features(trail_id):
            key = featureObj['type'] + ':' + featureObj['value']
            markedEntities.add(key)


        # for each url get all extracted entities
        entity_data_connector = factory.get_entity_data_connector()
        all_entities = entity_data_connector.get_extracted_entities_from_urls(trail_report.keys())
        for url, featureDict in all_entities.iteritems():
            for type, values in featureDict.iteritems():
                filtered_values = []
                for value in values:
                    key = type + ':' + value
                    if key not in markedEntities:
                        filtered_values.append(value)
                if len(filtered_values) > 0:
                    try:
                        if 'auto_features' not in trail_report[url]:
                            trail_report[url]['auto_features'] = {}
                        trail_report[url]['auto_features'][type] = filtered_values
                    except Exception:
                        tangelo.log("report generation error. skipping url.")
                        continue


    # for each url get any manually extracted entities
    for url in trail_report.keys():
        for featureObj in db.get_manual_features(trail_id, url):
            if 'manual_features' not in trail_report[url]:
                trail_report[url]['manual_features'] = {}
            if featureObj['type'] not in trail_report[url]['manual_features']:
                trail_report[url]['manual_features'][featureObj['type']] = []
            trail_report[url]['manual_features'][featureObj['type']].append(featureObj['value'])


    # for each url get any highlighted text
    for url in trail_report.keys():
        selections = db.getSelections(trail_id, url)

        # lets keep user names out of reports for now
        if len(selections) > 0:
            for sel in selections:
                del sel['userEmail']
            trail_report[url]['selections'] = selections


    result = {'trail': trailData, 'urls': trail_report.values()}
    return json.dumps(result, sort_keys=True, indent=4, separators=(',', ':'))
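
As the docstring describes, the report pairs the trail metadata with a list of per-url entries. A hedged usage sketch of requesting and reading such a report, assuming the handler is importable as get and that the ids and star threshold below are placeholders:

import json

# Hypothetical call: only urls averaging 3 stars or better, with auto-extracted features included.
report = json.loads(get(team_id=1, domain_id=1, trail_id=1, stars=3, extracted_features='y'))

print(report['trail']['url-count'])
for entry in report['urls']:
    # 'url-visit-count' is only set for urls seen in the browse path, so default to 0
    print(entry['url'], entry.get('url-visit-count', 0), entry['rank']['avg'])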