Пример #1
0
def extract_relevant_cybergeo_fulltext(kwLimit):
    """Extract relevant full-text keywords for cybergeo and export them as CSV.

    Queries the ``cybergeo`` table for rows with a non-empty
    ``fulltext_keywords`` column, feeds the ids and the keyword dictionaries
    to ``kwFunctions.extract_relevant_keywords`` and writes the two results
    under ``res/cybergeo_full/``.

    Args:
        kwLimit: Limit forwarded to ``kwFunctions.extract_relevant_keywords``;
            it is also appended to both output file names.
    """
    resdir = "res/cybergeo_full/"
    source = "mysql"  # passed as the second argument of both utils readers

    # Both queries use the same table, filter and LIMIT; only the selected
    # columns differ.
    # NOTE(review): "LIMIT 10" looks like a debugging leftover -- confirm
    # before running on the whole table.
    selection = (
        " FROM cybergeo WHERE fulltext_keywords IS NOT NULL"
        " AND fulltext_keywords!='' LIMIT 10;"
    )
    corpus = utils.get_data("SELECT id" + selection, source)
    occurence_dicos = utils.import_kw_dico_req(
        "SELECT id,fulltext_keywords" + selection, source
    )

    relevantkw, relevant_dico = kwFunctions.extract_relevant_keywords(
        corpus, kwLimit, occurence_dicos
    )

    # export as csv
    suffix = str(kwLimit)
    utils.export_dico_csv(relevant_dico, resdir + "relevantDico_kw" + suffix, False)
    # The exporter lives in ``utils`` like every other helper used here; the
    # previous unqualified call raised NameError unless it was star-imported.
    utils.export_dico_num_csv(relevantkw, resdir + "termhoods_kw" + suffix, False)
Пример #2
0
def extract_relevant_cybergeo(kwLimit, database):
    """Extract relevant abstract keywords for cybergeo and export them as CSV.

    Selects the cybergeo ids (joined with ``refdesc``) whose
    ``abstract_keywords`` is non-empty, prints the intermediate data, runs
    ``kwFunctions.extract_relevant_keywords`` and writes both results under
    ``res/cybergeo/``.

    Args:
        kwLimit: Limit forwarded to ``kwFunctions.extract_relevant_keywords``;
            also appended to both output file names.
        database: Forwarded unchanged as the second argument of
            ``utils.get_data`` and ``utils.import_kw_dico_req``.
    """
    # Shared tail of the two queries: same join, same non-empty filter.
    joined_filter = (
        " FROM refdesc INNER JOIN cybergeo ON cybergeo.id=refdesc.id"
        " WHERE abstract_keywords IS NOT NULL AND abstract_keywords!='';"
    )

    corpus = utils.get_data("SELECT cybergeo.id" + joined_filter, database)
    print(corpus)

    occurence_dicos = utils.import_kw_dico_req(
        "SELECT cybergeo.id,abstract_keywords" + joined_filter, database
    )
    print(occurence_dicos)

    extraction = kwFunctions.extract_relevant_keywords(corpus, kwLimit, occurence_dicos)
    relevantkw, relevant_dico = extraction

    limit_tag = str(kwLimit)
    utils.export_dico_csv(relevant_dico, "res/cybergeo/relevantDico_kwLimit" + limit_tag, False)
    utils.export_dico_num_csv(relevantkw, "res/cybergeo/kw_" + limit_tag, False)
Пример #3
0
def relevant_full_corpus(kwLimit):
    """Recompute the relevant keywords of the full corpus and store them in MongoDB.

    Loads ids and keyword dictionaries through ``utils``, runs
    ``kwFunctions.extract_relevant_keywords`` and rewrites two collections of
    the ``relevant`` database on ``localhost:27017``:
    ``relevant_full_<kwLimit>`` (one ``butils.update_kw_tm`` call per keyword)
    and ``network_full_<kwLimit>_eth10`` (the returned edge list).

    Args:
        kwLimit: Limit forwarded to the extraction; also part of both
            collection names.
    """
    corpus = utils.get_ids('cybergeo','keywords')
    occurence_dicos = utils.import_kw_dico('cybergeo','keywords')
    relevant = 'relevant_full_'+str(kwLimit)
    network = 'network_full_'+str(kwLimit)+'_eth10'

    mongo = pymongo.MongoClient('localhost',27017)
    try:
        database = mongo['relevant']
        # Drop the previously stored keywords before recomputing them.
        database[relevant].delete_many({"cumtermhood":{"$gt":0}})
        database[relevant].create_index('keyword')

        # The second returned element is not used by this function.
        keywords,_,frequencies,edge_list = kwFunctions.extract_relevant_keywords(corpus,kwLimit,occurence_dicos)

        print('insert relevant...')
        corpus_size = len(corpus)
        for kw,termhood in keywords.items():
            # NOTE(review): on Python 2 this `/` truncates when both operands
            # are ints -- confirm the intended interpreter.
            weight = math.log(termhood)*math.log(corpus_size/frequencies[kw])
            butils.update_kw_tm(kw,termhood,frequencies[kw],weight,database,relevant)

        print('insert edges...')
        database[network].delete_many({"weight":{"$gt":0}})
        # insert_many rejects an empty document list with TypeError; with no
        # edges there is simply nothing to store.
        if edge_list:
            database[network].insert_many(edge_list)
    finally:
        # Release the connection pool even when a step above fails.
        mongo.close()
Пример #4
0
def extract_relevant_cybergeo(kwLimit, database):
    """Compute and export the relevant abstract keywords of cybergeo.

    Steps, in order: fetch the ids, print them, fetch the per-document
    keyword dictionaries, print them, run
    ``kwFunctions.extract_relevant_keywords`` and export its two results as
    CSV files under ``res/cybergeo/``.

    Args:
        kwLimit: Limit handed to the extraction and used as file-name suffix.
        database: Passed through as the second argument of the ``utils``
            query helpers.
    """
    ids_query = (
        'SELECT cybergeo.id FROM refdesc INNER JOIN cybergeo'
        ' ON cybergeo.id=refdesc.id'
        ' WHERE abstract_keywords IS NOT NULL AND abstract_keywords!=\'\';')
    keywords_query = (
        'SELECT cybergeo.id,abstract_keywords FROM refdesc INNER JOIN cybergeo'
        ' ON cybergeo.id=refdesc.id'
        ' WHERE abstract_keywords IS NOT NULL AND abstract_keywords!=\'\';')

    corpus = utils.get_data(ids_query, database)
    print(corpus)
    occurence_dicos = utils.import_kw_dico_req(keywords_query, database)
    print(occurence_dicos)

    relevantkw, relevant_dico = kwFunctions.extract_relevant_keywords(
        corpus, kwLimit, occurence_dicos)

    # Output targets: per-reference dictionary first, numeric table second.
    dico_target = 'res/cybergeo/relevantDico_kwLimit' + str(kwLimit)
    kw_target = 'res/cybergeo/kw_' + str(kwLimit)
    utils.export_dico_csv(relevant_dico, dico_target, False)
    utils.export_dico_num_csv(relevantkw, kw_target, False)
Пример #5
0
def extract_relevant_cybergeo_fulltext(kwLimit):
    """Compute and export the relevant full-text keywords of cybergeo.

    Fetches ids and ``fulltext_keywords`` from the ``cybergeo`` table
    (non-empty rows only), runs ``kwFunctions.extract_relevant_keywords`` and
    writes both results as CSV files in ``res/cybergeo_full/``.

    Args:
        kwLimit: Limit handed to the extraction and used as suffix of both
            output file names.
    """
    resdir = 'res/cybergeo_full/'

    # NOTE(review): both queries carry "LIMIT 10", which looks like a
    # debugging leftover -- confirm before a full run.
    ids_query = (
        'SELECT id FROM cybergeo WHERE fulltext_keywords IS NOT NULL'
        ' AND fulltext_keywords!=\'\' LIMIT 10;')
    keywords_query = (
        'SELECT id,fulltext_keywords FROM cybergeo'
        ' WHERE fulltext_keywords IS NOT NULL'
        ' AND fulltext_keywords!=\'\' LIMIT 10;')

    corpus = utils.get_data(ids_query, 'mysql')
    occurence_dicos = utils.import_kw_dico_req(keywords_query, 'mysql')

    relevantkw, relevant_dico = kwFunctions.extract_relevant_keywords(
        corpus, kwLimit, occurence_dicos)

    # export as csv
    utils.export_dico_csv(relevant_dico,
                          resdir + 'relevantDico_kw' + str(kwLimit), False)
    # Qualified with ``utils`` like the other helpers used here: the bare
    # name raised NameError unless it happened to be star-imported.
    utils.export_dico_num_csv(relevantkw,
                              resdir + 'termhoods_kw' + str(kwLimit), False)
Пример #6
0
def bootstrap_subcorpuses(corpus,occurence_dicos,kwLimit,subCorpusSize,bootstrapSize):
    """Run keyword extraction on random sub-corpora and aggregate the results.

    Draws ``bootstrapSize`` random subsets of ``corpus`` (each of size
    ``subCorpusSize``, sampled without replacement), runs
    ``kwFunctions.extract_relevant_keywords`` on every subset, sums the
    per-keyword values over all runs and merges the reference -> keywords
    mappings.

    Args:
        corpus: Indexable sequence of documents; ``len(corpus)`` is the
            population size.
        occurence_dicos: Forwarded unchanged to the extraction.
        kwLimit: Forwarded to the extraction and to
            ``kwFunctions.extract_from_termhood``.
        subCorpusSize: Number of documents per draw.
        bootstrapSize: Number of draws.

    Returns:
        The value returned by ``kwFunctions.extract_from_termhood`` with the
        list of per-run keyword dictionaries appended as its last element.
    """
    N = len(corpus)

    print('Bootstrapping on corpus of size '+str(N))

    # One index array per run. numpy.random.choice(N, ...) already yields
    # 0-based indices in [0, N), so no shift is needed; the former "- 1" only
    # worked because index -1 wraps around to the last document.
    extractions = [
        numpy.random.choice(N, subCorpusSize, replace=False)
        for _ in range(bootstrapSize)
    ]

    # Summed over runs; a keyword missing from a run contributes 0. The sums
    # are not normalised because only their ordering is used afterwards.
    mean_termhoods = {}
    # ref -> set of keywords, cumulated over all runs.
    ref_kw_dico = {}
    allkw = []

    for run, extraction in enumerate(extractions):
        print("bootstrap : run "+str(run))
        subcorpus = [corpus[i] for i in extraction]
        keywords, ref_kw_local_dico = kwFunctions.extract_relevant_keywords(subcorpus,kwLimit,occurence_dicos)

        allkw.append(keywords)

        # add termhoods
        for kw, termhood in keywords.items():
            mean_termhoods[kw] = mean_termhoods.get(kw, 0) + termhood

        # update ref -> kw dico
        for ref, local_kws in ref_kw_local_dico.items():
            ref_kw_dico.setdefault(ref, set()).update(local_kws)

    # sort on termhoods and return
    res = kwFunctions.extract_from_termhood(mean_termhoods,ref_kw_dico,kwLimit)
    res.append(allkw)
    return res
Пример #7
0
def relevant_full_corpus(kwLimit):
    """Rebuild the relevant-keyword and edge collections for the full corpus.

    Reads ids and keyword dictionaries via ``utils``, calls
    ``kwFunctions.extract_relevant_keywords`` and stores the outcome in the
    ``relevant`` MongoDB database on ``localhost:27017``: keywords go to
    ``relevant_full_<kwLimit>`` through ``butils.update_kw_tm`` and edges go
    to ``network_full_<kwLimit>_eth10``.

    Args:
        kwLimit: Limit forwarded to the extraction; also embedded in both
            collection names.
    """
    corpus = utils.get_ids('cybergeo', 'keywords')
    occurence_dicos = utils.import_kw_dico('cybergeo', 'keywords')
    relevant = 'relevant_full_' + str(kwLimit)
    network = 'network_full_' + str(kwLimit) + '_eth10'

    mongo = pymongo.MongoClient('localhost', 27017)
    try:
        database = mongo['relevant']
        # Clear previously stored keywords before recomputing them.
        database[relevant].delete_many({"cumtermhood": {"$gt": 0}})
        database[relevant].create_index('keyword')

        # The second element of the result is not needed here.
        keywords, _, frequencies, edge_list = (
            kwFunctions.extract_relevant_keywords(corpus, kwLimit,
                                                  occurence_dicos))

        print('insert relevant...')
        n_docs = len(corpus)
        for kw, termhood in keywords.items():
            frequency = frequencies[kw]
            # NOTE(review): with Python 2 and integer operands this division
            # truncates -- confirm the target interpreter.
            weight = math.log(termhood) * math.log(n_docs / frequency)
            butils.update_kw_tm(kw, termhood, frequency, weight, database,
                                relevant)

        print('insert edges...')
        database[network].delete_many({"weight": {"$gt": 0}})
        # An empty edge list would make insert_many raise TypeError after the
        # old edges were already removed; skip the insert instead.
        if edge_list:
            database[network].insert_many(edge_list)
    finally:
        # Always release the client's connections, also on failure.
        mongo.close()