Example #1
def get_entities(trail_id):
    tangelo.log('Getting entities for trail: %s' % trail_id)
    entities = {}
    entityList = []
    urls = []
    rows = db.getBrowsePathUrls(trail_id)
    for row in rows:
        urls.append(row['url'])

    entity_data_connector = factory.get_entity_data_connector()
    results = entity_data_connector.get_extracted_entities_from_urls(urls)

    tangelo.log('Got entities')

    for result in results:
        for entityType in results[result]:
            for entityName in results[result][entityType]:
                if entityName in entities:
                    entities[entityName]['pages'] += 1
                else:
                    entities[entityName] = {'type': entityType, 'pages':1}
    # TODO either figure out how to map the data or do this differently
    for entity in entities:
        entityList.append({'name': entity, 'type': entities[entity]['type'], 'pages': entities[entity]['pages']})
    return json.dumps(entityList)
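A quick way to see what get_entities produces is to run its aggregation step on sample data. The sketch below is self-contained and hypothetical: it assumes the entity connector returns a mapping of url to {entity_type: [entity_name, ...]}, which is how the loops above index results; the sample URLs and names are invented for illustration.

import json

# Assumed shape of the connector result: {url: {entity_type: [entity_name, ...]}}
results = {
    'http://example.com/a': {'person': ['Ada Lovelace'], 'org': ['ACME']},
    'http://example.com/b': {'person': ['Ada Lovelace']},
}

entities = {}
for url in results:
    for entity_type in results[url]:
        for entity_name in results[url][entity_type]:
            if entity_name in entities:
                entities[entity_name]['pages'] += 1
            else:
                entities[entity_name] = {'type': entity_type, 'pages': 1}

entity_list = [{'name': name, 'type': info['type'], 'pages': info['pages']}
               for name, info in entities.items()]
print(json.dumps(entity_list))
# Ada Lovelace ends up with pages = 2, ACME with pages = 1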
Example #2
def get(domain,trail,stars,newdomain):
    org = helper.get_org().upper()

    if not db.domain_exists(newdomain):
        db.add_new_domain(newdomain,'auto generated domain from trail: '+trail)

    features = set([])
    url_set = set([])
    stars = int(stars)
    # get all starred urls for the trail
    for (url,rank) in db.getRankedUrls(org,trail,domain):
        url_set.add(url)

    if stars < 1:
        urls = db.getBrowsePathUrls(org,trail)
        for url in urls:
            url_set.add(url)

    # get the list of invalid entities for the domain
    markedEntities = set([])
    for (type,value) in db.get_marked_entities_for_domain(org, domain):
        markedEntities.add(value)


    # for each url get all extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(url_set)
    for url,featureDict in all_entities.iteritems():
        for type,values in featureDict.iteritems():
            type = type.replace(',',' ')
            for value in values:
                if value not in markedEntities:
                    value = value.replace(',',' ')
                    features.add(type+"\0"+value)

    # for each url get any manually extracted entities
    for url in url_set:
        for featureObj in db.get_feedback_entities(org, domain, url):
            type = featureObj['type'].replace(',',' ')
            value = featureObj['value'].replace(',',' ')
            features.add(type+"\0"+value)

    entity_data_connector.add_new_domain_items(map(lambda x: newdomain+'\0'+x,features))
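The function above packs each feature as a null-delimited string (type + "\0" + value, with commas stripped) and then prefixes the target domain, so every item passed to add_new_domain_items has the form domain\0type\0value. Below is a minimal sketch of how such an item could be unpacked on the consuming side; the helper name is hypothetical, only the item format comes from the code above.

def unpack_domain_item(item):
    # Items are built above as: newdomain + "\0" + type + "\0" + value
    domain, feature_type, value = item.split("\0", 2)
    return domain, feature_type, value

# unpack_domain_item("my-domain\0person\0Ada Lovelace")
# -> ("my-domain", "person", "Ada Lovelace")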
Example #3
def get(domain, trail, stars, newdomain):
    org = helper.get_org().upper()

    if not db.domain_exists(newdomain):
        db.add_new_domain(newdomain,
                          'auto generated domain from trail: ' + trail)

    features = set([])
    url_set = set([])
    stars = int(stars)
    # get all starred urls for the trail

    for (url, rank) in db.getRankedUrls(org, trail, domain):
        url_set.add(url)

    if stars < 1:
        urls = db.getBrowsePathUrls(org, trail)
        for url in urls:
            url_set.add(url)

    # get the list of invalid entities for the domain
    markedEntities = set([])
    for (type, value) in db.get_marked_entities_for_domain(org, domain):
        markedEntities.add(value)

    # for each url get all extracted entities
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(
        url_set)
    for url, featureDict in all_entities.iteritems():
        for type, values in featureDict.iteritems():
            type = type.replace(',', ' ')
            for value in values:
                if value not in markedEntities:
                    value = value.replace(',', ' ')
                    features.add(type + "\0" + value)

    # for each url get any manually extracted entities
    for url in url_set:
        for featureObj in db.get_feedback_entities(org, domain, url):
            type = featureObj['type'].replace(',', ' ')
            value = featureObj['value'].replace(',', ' ')
            features.add(type + "\0" + value)

    entity_data_connector.add_new_domain_items(
        map(lambda x: newdomain + '\0' + x, features))
Example #4
def get_visited(trail_id):
    tangelo.log('Getting visited links for %s' % trail_id)
    results = db.getBrowsePathUrls(trail_id)
    return json.dumps(results)
Example #5
def get(team_id,domain_id,trail_id,stars,extracted_features):
    """

    :param team_id:  id of the team
    :param domain_id: id of domain
    :param trail_id: id of the trail
    :param stars: minimum avg star ranking to allow in report
    :param extracted_features: y or n (include auto extracted features in report)
    :return:
        {
            trail: {id:..,name:...,description:.., url-count}
            urls: [
                {
                    url: ..,
                    url-visit-count: ..
                    rank: {min: ?, max: ?, avg: ?, count: ?}, # rounded to nearest tenth
                    auto_features: { type: [value,..],..},
                    manual_features: { type: [value,..],..},
                    selections: {ts: "2015-02-25 16:24:41", selection: "selected text"}
                },..
            ]
        }

    """


    trailData = db.getTrailData(trail_id)
    trail_report = {}


    # get ranked urls
    for rankObject in db.getRankedUrls(trail_id):
        if rankObject['avg'] >= stars:
            url = rankObject['url']
            del rankObject['url']
            trail_report[url] = {'url': url, 'rank':rankObject}

    # get url hit counts and unranked urls
    for urlObj in db.getBrowsePathUrls(trail_id):
        url = urlObj['url']
        if url in trail_report or stars < 1:
            if url not in trail_report:
                trail_report[url] = {'url': url, 'rank':{'min':0,'max':0,'avg':0,'count':0} }
            trail_report[url]['url-visit-count'] = urlObj['count']

    trailData['url-count'] = len(trail_report.keys())

    # get the list of invalid entities for the domain
    if extracted_features != 'n':
        markedEntities = set([])
        for featureObj in db.get_marked_features(trail_id):
            key = featureObj['type']+':'+featureObj['value']
            markedEntities.add(key)


        # for each url get all extracted entities
        entity_data_connector = factory.get_entity_data_connector()
        all_entities = entity_data_connector.get_extracted_entities_from_urls(trail_report.keys())
        for url,featureDict in all_entities.iteritems():
            for type,values in featureDict.iteritems():
                filtered_values = []
                for value in values:
                    key = type+':'+value
                    if key not in markedEntities:
                        filtered_values.append(value)
                if len(filtered_values) > 0:
                    try:
                        if 'auto_features' not in trail_report[url]:
                            trail_report[url]['auto_features'] = {}
                        trail_report[url]['auto_features'][type] = filtered_values
                    except:
                        tangelo.log("report generation error. skipping url.")
                        continue


    # for each url get any manually extracted entities
    for url in trail_report.keys():
        for featureObj in db.get_manual_features(trail_id,url):
            if 'manual_features' not in trail_report[url]:
                trail_report[url]['manual_features'] = {}
            if featureObj['type'] not in trail_report[url]['manual_features']:
                trail_report[url]['manual_features'][featureObj['type']] = []
            trail_report[url]['manual_features'][featureObj['type']].append(featureObj['value'])


    # for each url get any highlighted text
    for url in trail_report.keys():
        selections = db.getSelections(trail_id, url)

        # let's keep user names out of reports for now
        if len(selections) > 0:
            for sel in selections:
                del sel['userEmail']
            trail_report[url]['selections'] = selections


    result = {'trail': trailData,'urls':trail_report.values()}
    return json.dumps(result,sort_keys=True,indent=4,separators=(',',':'))
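For reference, here is a small consumer of the JSON this function returns; the helper name is hypothetical, and the keys follow the structure documented in the docstring and built above (trail, urls, rank, auto_features, manual_features, selections).

import json

def summarize_report(report_json):
    # Hypothetical helper: walks {'trail': {...}, 'urls': [{...}, ...]} as produced above.
    report = json.loads(report_json)
    print('%s: %s urls' % (report['trail'].get('name', '?'),
                           report['trail'].get('url-count', 0)))
    for entry in report['urls']:
        auto = sum(len(v) for v in entry.get('auto_features', {}).values())
        manual = sum(len(v) for v in entry.get('manual_features', {}).values())
        print('%s  avg rank %s  %d auto / %d manual features  %d selections' % (
            entry['url'], entry['rank']['avg'], auto, manual,
            len(entry.get('selections', []))))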