def get_entities(trail_id):
    """Return a JSON array summarising the entities extracted along a trail.

    The URLs recorded for ``trail_id`` are looked up, the entity data
    connector is asked for the entities extracted from those URLs, and every
    occurrence of an entity name in the connector's answer adds one to that
    entity's ``pages`` counter.  An entity keeps the type it was first seen
    with.  Each element of the returned array has the keys ``name``,
    ``type`` and ``pages``.
    """
    tangelo.log('Getting entities for trail: %s' % trail_id)

    trail_urls = [record['url'] for record in db.getBrowsePathUrls(trail_id)]
    connector = factory.get_entity_data_connector()
    extracted = connector.get_extracted_entities_from_urls(trail_urls)
    tangelo.log('Got entities')

    # entity name -> {'type': first type seen, 'pages': occurrence count}
    summary = {}
    for page_url in extracted:
        by_type = extracted[page_url]
        for entity_type in by_type:
            for entity_name in by_type[entity_type]:
                entry = summary.setdefault(
                    entity_name, {'type': entity_type, 'pages': 0})
                entry['pages'] += 1

    # TODO either figure out how to map the data or do this differently
    return json.dumps([
        {'name': name, 'type': info['type'], 'pages': info['pages']}
        for name, info in summary.items()
    ])
def get(domain, trail, stars, newdomain):
    """Seed ``newdomain`` with the entities collected along a trail.

    ``newdomain`` is created when it does not exist yet.  Entities are then
    gathered from the trail's ranked URLs (plus every browse-path URL when
    ``stars`` is below 1): the automatically extracted entities whose value
    has not been marked for ``domain``, and all feedback entities.  Each
    collected entity is handed to the entity data connector as one
    NUL-separated string made of ``newdomain``, the type and the value.

    :param domain: domain whose rankings, marked entities and feedback
        entities are read
    :param trail: trail to harvest URLs from
    :param stars: anything ``int()`` accepts; a value below 1 additionally
        pulls in every URL of the trail's browse path
    :param newdomain: domain that receives the collected entities
    :return: None
    :raises ValueError: if ``stars`` cannot be converted to an int
    """
    # Parse ``stars`` before touching the database so a malformed value fails
    # fast instead of leaving a freshly created, empty domain behind.
    stars = int(stars)

    org = helper.get_org().upper()
    if not db.domain_exists(newdomain):
        db.add_new_domain(
            newdomain, 'auto generated domain from trail: ' + trail)

    # get all starred urls for the trail
    url_set = set(url for (url, _rank) in db.getRankedUrls(org, trail, domain))
    if stars < 1:
        url_set.update(db.getBrowsePathUrls(org, trail))

    # get the invalid entities for the domain; they are matched by value
    # only, whatever type they were marked under
    marked_values = set(
        value
        for (_type, value) in db.get_marked_entities_for_domain(org, domain))

    features = set()

    # for each url get all extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(
        url_set)
    for _url, feature_dict in all_entities.iteritems():
        for feature_type, values in feature_dict.iteritems():
            # Commas are replaced by spaces in both type and value
            # (presumably the stored format is comma-delimited -- TODO
            # confirm).  The marked-entity check uses the raw value.
            feature_type = feature_type.replace(',', ' ')
            for value in values:
                if value not in marked_values:
                    features.add(
                        feature_type + "\0" + value.replace(',', ' '))

    # for each url get any manually extracted entities
    for url in url_set:
        for feature in db.get_feedback_entities(org, domain, url):
            feature_type = feature['type'].replace(',', ' ')
            value = feature['value'].replace(',', ' ')
            features.add(feature_type + "\0" + value)

    entity_data_connector.add_new_domain_items(
        [newdomain + '\0' + feature for feature in features])
def get(domain, trail, stars, newdomain):
    """Seed ``newdomain`` with the entities collected along a trail.

    ``newdomain`` is created when it does not exist yet.  Entities are then
    gathered from the trail's ranked URLs (plus every browse-path URL when
    ``stars`` is below 1): the automatically extracted entities whose value
    has not been marked for ``domain``, and all feedback entities.  Each
    collected entity is handed to the entity data connector as one
    NUL-separated string made of ``newdomain``, the type and the value.

    :param domain: domain whose rankings, marked entities and feedback
        entities are read
    :param trail: trail to harvest URLs from
    :param stars: anything ``int()`` accepts; a value below 1 additionally
        pulls in every URL of the trail's browse path
    :param newdomain: domain that receives the collected entities
    :return: None
    :raises ValueError: if ``stars`` cannot be converted to an int
    """
    # Parse ``stars`` before touching the database so a malformed value fails
    # fast instead of leaving a freshly created, empty domain behind.
    stars = int(stars)

    org = helper.get_org().upper()
    if not db.domain_exists(newdomain):
        db.add_new_domain(
            newdomain, 'auto generated domain from trail: ' + trail)

    # get all starred urls for the trail
    url_set = set(url for (url, _rank) in db.getRankedUrls(org, trail, domain))
    if stars < 1:
        url_set.update(db.getBrowsePathUrls(org, trail))

    # get the invalid entities for the domain; they are matched by value
    # only, whatever type they were marked under
    marked_values = set(
        value
        for (_type, value) in db.get_marked_entities_for_domain(org, domain))

    features = set()

    # for each url get all extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(
        url_set)
    for _url, feature_dict in all_entities.iteritems():
        for feature_type, values in feature_dict.iteritems():
            # Commas are replaced by spaces in both type and value
            # (presumably the stored format is comma-delimited -- TODO
            # confirm).  The marked-entity check uses the raw value.
            feature_type = feature_type.replace(',', ' ')
            for value in values:
                if value not in marked_values:
                    features.add(
                        feature_type + "\0" + value.replace(',', ' '))

    # for each url get any manually extracted entities
    for url in url_set:
        for feature in db.get_feedback_entities(org, domain, url):
            feature_type = feature['type'].replace(',', ' ')
            value = feature['value'].replace(',', ' ')
            features.add(feature_type + "\0" + value)

    entity_data_connector.add_new_domain_items(
        [newdomain + '\0' + feature for feature in features])
def get_visited(trail_id):
    """Return the browse-path records of ``trail_id`` serialised as JSON.

    The records are whatever ``db.getBrowsePathUrls`` yields for the trail;
    they are passed to ``json.dumps`` unchanged.
    """
    tangelo.log('Getting visited links for %s' % trail_id)
    visited = db.getBrowsePathUrls(trail_id)
    serialised = json.dumps(visited)
    return serialised
def get(team_id, domain_id, trail_id, stars, extracted_features):
    """
    Build the JSON report for a trail.

    :param team_id: id of the team (not read by this function)
    :param domain_id: id of domain (not read by this function)
    :param trail_id: id of the trail
    :param stars: minimum avg star ranking to allow in report; a value
        below 1 also includes visited urls that were never ranked.
        NOTE(review): it is compared numerically against ``avg`` and ``1``
        without conversion -- presumably callers pass a number; confirm.
    :param extracted_features: y or n (include auto extracted features in
        report); anything other than 'n' includes them
    :return: JSON text (sorted keys, 4-space indent) shaped like
        {
            trail: {id:..,name:...,description:.., url-count}
            urls: [
                {
                    url: ..,
                    url-visit-count: ..
                    rank: {min: ?, max: ?, avg: ?, count: ?},
                    auto_features: { type: [value,..],..},
                    manual_features: { type: [value,..],..},
                    selections: [
                        {ts: "2015-02-25 16:24:41",
                         selection: "selected text"},..
                    ]
                },..
            ]
        }
        ``auto_features``, ``manual_features`` and ``selections`` are only
        present when there is something to report for the url;
        ``url-visit-count`` only for urls found in the browse path.
    """
    trail_data = db.getTrailData(trail_id)
    trail_report = {}

    # get ranked urls whose average rating reaches the threshold
    for rank in db.getRankedUrls(trail_id):
        if rank['avg'] >= stars:
            # the url becomes the report key, so drop it from the rank data
            url = rank.pop('url')
            trail_report[url] = {'url': url, 'rank': rank}

    # get url hit counts and un ranked urls
    for visit in db.getBrowsePathUrls(trail_id):
        url = visit['url']
        if url not in trail_report:
            if not stars < 1:
                # unranked urls are reported only without a minimum rating
                continue
            trail_report[url] = {
                'url': url,
                'rank': {'min': 0, 'max': 0, 'avg': 0, 'count': 0},
            }
        trail_report[url]['url-visit-count'] = visit['count']

    trail_data['url-count'] = len(trail_report)

    if extracted_features != 'n':
        # get the invalid entities for the trail, keyed as "type:value"
        marked_entities = set(
            marked['type'] + ':' + marked['value']
            for marked in db.get_marked_features(trail_id))

        # for each url get all extracted entities
        entity_data_connector = factory.get_entity_data_connector()
        all_entities = entity_data_connector.get_extracted_entities_from_urls(
            list(trail_report))
        for url, feature_dict in all_entities.iteritems():
            entry = trail_report.get(url)
            if entry is None:
                # The connector answered for a url that is not part of the
                # report; skip the whole url instead of relying on a bare
                # ``except`` around the dictionary access.
                tangelo.log("report generation error. skipping url.")
                continue
            for feature_type, values in feature_dict.iteritems():
                kept_values = [
                    value for value in values
                    if feature_type + ':' + value not in marked_entities
                ]
                if kept_values:
                    auto_features = entry.setdefault('auto_features', {})
                    auto_features[feature_type] = kept_values

    # for each url get any manually extracted entities
    for url, entry in trail_report.items():
        for feature in db.get_manual_features(trail_id, url):
            manual_features = entry.setdefault('manual_features', {})
            manual_features.setdefault(feature['type'], []).append(
                feature['value'])

    # for each url get any highlighted text
    for url, entry in trail_report.items():
        selections = db.getSelections(trail_id, url)
        if selections:
            # lets keep user names out of reports for now; tolerate
            # selections that carry no userEmail at all
            for selection in selections:
                selection.pop('userEmail', None)
            entry['selections'] = selections

    result = {'trail': trail_data, 'urls': list(trail_report.values())}
    return json.dumps(
        result, sort_keys=True, indent=4, separators=(',', ':'))