Exemplo n.º 1
0
def run():
    """Generate origin-country maps for applicants and inventors.

    Reads bibliographic data for each configured dataset prefix, renders
    the map output (JSON + HTML) to the result directory, and copies the
    D3 country resources next to the results.  Does nothing unless
    ``FormateExportCountryCartography`` is enabled in the configuration.
    """

    boot_logging()

    config = LoadConfig()

    # Bail out early when this export step is disabled.
    if not config.FormateExportCountryCartography:
        return

    biblio_path = config.ResultBiblioPath
    result_path = config.ResultPath

    # One run for the plain dataset, plus one for families when enabled.
    dataset_prefixes = [""]
    if config.GatherFamilly:
        dataset_prefixes.append("Families")

    for dataset_prefix in dataset_prefixes:

        # Status message
        logger.info(
            "Generating maps about applicants' and inventors' origin countries for {}. "
            .format(label_from_prefix(dataset_prefix)))

        # Storage slot, e.g. "Lentille" vs. "FamiliesLentille".
        slot = dataset_prefix + config.ndf

        # Read data from storage, generate the map, write JSON and HTML.
        generate_map(biblio_path, slot, result_path)

    # D3 expects the country resources to live in the working directory
    # (the alternative would be serving them over HTTP), so clone them
    # into the result directory:
    # http://stackoverflow.com/questions/17077931/d3-samples-in-a-microsoft-stack
    shutil.copy('countries.json',
                os.path.join(result_path, "countries.json"))
Exemplo n.º 2
0
def main():
    """Drive the split / process / fusion pipeline, then launch p2n.

    Runs the three helper scripts sequentially (each stage must finish
    before the next starts), then fires off ``p2n run`` on the fused
    request file without waiting for it to complete.
    """
    settings = LoadConfig()
    target_path = get_target_path()

    # Clear any state left over from a previous run.
    ProcessList(settings.ndf).reset()
    FusionList(settings.ndf).reset()

    # The three pipeline stages must run strictly one after another.
    for script in ('Patent2Net/scripts/run_spliter.py',
                   'Patent2Net/scripts/process_list.py',
                   'Patent2Net/scripts/fusion_patents.py'):
        Popen(['python', script, target_path]).wait()

    fused_request = os.path.join("..",
                                 target_path.replace(".cql", "") + "_fusion.cql")
    # Fire-and-forget: the final run is intentionally not awaited.
    Popen(['p2n', 'run', "--config=" + fused_request])
Exemplo n.º 3
0
def get_one_request(p2n_dir):
    """Assemble the status payload for a single request directory."""
    dex = get_current_dex()

    # The request's configuration lives in a ".cql" file named after the
    # directory.
    config = LoadConfig(p2n_dir + ".cql")

    payload = {
        "done": p2n_dir in dex["done"],
        "state": get_state(p2n_dir),
        "data": get_directory_request_data_all(p2n_dir),
        "progress": get_data_progress(p2n_dir),
        "directory": p2n_dir,
        "cql": {
            "requete": config.requete,
            "ndf": config.ndf,
            "options": {
                "GatherContent": config.GatherContent,
                "GatherBiblio": config.GatherBiblio,
                "GatherPatent": config.GatherPatent,
                "GatherFamilly": config.GatherFamilly,
            },
        },
    }
    return get_success_response("", payload)
Exemplo n.º 4
0
def run():
    """Filter equivalent patents out of the gathered datasets.

    For each dataset pickle (plain, plus the "Families" variant when
    family gathering is enabled) this groups patents with their declared
    equivalents, keeps one representative per group (oldest date, ties
    broken by the entry carrying the most information), re-adds patents
    whose equivalents never appeared, and finally rewrites the pickle
    file, keeping the previous one as ``Old<name>``.  Files that already
    have an ``Old`` copy are skipped.

    NOTE(review): ``IsEnableScript`` is loaded but never checked, so this
    step always runs.  Several other values below (aujourd, requete,
    Gather*, lstApplic, Inventeurs, Applicants, nbAppliAvant, nbInvAvant,
    temporPath, ResultPathContent, ResultListPath, cptInv, cptAppl,
    NewFilt) are assigned but never used afterwards.
    """

    aujourd = datetime.date.today()

    configFile = LoadConfig()
    requete = configFile.requete
    ndf = configFile.ndf
    Gather = configFile.GatherContent
    GatherBiblio = configFile.GatherBiblio
    GatherPatent = configFile.GatherPatent
    GatherFamilly = configFile.GatherFamilly
    IsEnableScript = configFile.FormateExportDataTable

    # Working directory paths from configuration.
    ListBiblioPath = configFile.ResultBiblioPath
    temporPath = configFile.temporPath
    ResultPathContent = configFile.ResultPath

    ResultListPath = configFile.ResultListPath
    ResultBiblioPath = configFile.ResultBiblioPath
    # Read the reference file.

    lstApplic = []

    Inventeurs = []
    Applicants = []
    nbAppliAvant = dict()
    nbInvAvant = dict()

    # Process the plain dataset file, plus the families file when enabled.
    if GatherFamilly:
        PU = [ndf, 'Families' + ndf]
    else:
        PU = [ndf]

    for fic in PU:
        # Skip files already processed (an "Old" backup exists).
        if 'Old' + fic not in os.listdir(ResultBiblioPath):
            cptInv, cptAppl = 0, 0

            print(
                "\n> Hi! This is Pre Process for filtering equivalents patents from dataset gathered by P2N-OPSGather: used on:",
                fic)
            if 'Description' + fic in os.listdir(ListBiblioPath):
                # NOTE(review): the handle "data" is opened but never used;
                # LoadBiblioFile re-reads the file by path itself.
                with open(ListBiblioPath + '//' + fic, 'r',
                          encoding="utf8") as data:
                    dico = LoadBiblioFile(ListBiblioPath, fic)
            else:  # Retrocompatibility
                print("please use Comptatibilizer")
                sys.exit()
            LstBrevet = dico['brevets']
            # Patent filtering process.
            Filtres = []
            dejaVus = []
            LabBrevets = [brev['label'] for brev in LstBrevet]
            for bre in LstBrevet:  # walk the patent list
                if bre['label'] not in dejaVus:  # not seen yet
                    dejaVus.append(bre['label'])  # mark as seen

                    # Walk the equivalents.  Two cases: a list, or a
                    # non-empty string.
                    if isinstance(bre['equivalents'],
                                  list) and len(bre['equivalents']) > 0:
                        # Collect the date of every equivalent present in
                        # the corpus.
                        dates = []
                        for brev in bre['equivalents']:
                            if brev in LabBrevets:  # only if it belongs to the initial corpus (otherwise ignored)
                                for brevet in LstBrevet:  # look it up
                                    if brevet[
                                            'label'] == brev:  # found it
                                        if isinstance(
                                                brevet["date"], list
                                        ):  # dates are sometimes a list OR a string
                                            date = min(brevet["date"])
                                        else:
                                            date = brevet["date"]
                                        # Record (date, patent, size in characters).
                                        if len(date) < 4:
                                            print("Aille")
                                        # NOTE(review): appends the *current*
                                        # patent `bre`, not the matched
                                        # equivalent `brevet` (compare the
                                        # commented-out line below) — confirm
                                        # this is intended.
                                        dates.append((date, bre,
                                                      len(str(bre.values()))))
                                        #dates.extend((brevet["date"][0], brevet, len(str(brevet.values()))) for brevet in LstBrevet if brevet['label'] == brev )
                                dejaVus.append(brev)

                        if len(dates) == 1:  # unambiguous
                            Filtres.append(dates[0][1])
                        elif len(dates) > 1:  # keep the oldest one
                            MiniDate = min([dat for dat, brev, val in dates])
                            MaxVal = max(val for dat, brev, val in dates)
                            if len(MiniDate) < 5:
                                print(bre['prior-Date'], ' -- > ', MiniDate)
                            # Give priority to the earliest date that also
                            # maximises length (supposed to be the entry
                            # carrying the most information).
                            candidat = [
                                bre for dat, bre, val in dates
                                if dat == MiniDate and val == MaxVal
                            ]
                            if len(candidat
                                   ) == 0:  # else fall back to max information content
                                # If that fails, give priority to maximum
                                # information content alone.
                                candidat = [
                                    brev for dat, brev, val in dates
                                    if val == MaxVal
                                ]
                                if len(candidat) > 1:
                                    priorDateMin = min([
                                        min(brevet["prior-Date"])
                                        for brevet in candidat
                                    ])
                                    NewCandidat = [
                                        brev for brev in candidat
                                        if priorDateMin in brev["prior-Date"]
                                    ]
                                    # NOTE(review): this condition looks
                                    # inverted — when exactly one candidate
                                    # remains nothing is appended (only
                                    # "pffff" is printed); confirm intent.
                                    if len(NewCandidat) > 1:
                                        NewCandidat = NewCandidat[0]
                                        Filtres.append(NewCandidat)
                                    else:
                                        print("pffff")
                            else:  # at least one candidate matched both criteria
                                Filtres.append(candidat[0])
                        else:
                            Filtres.append(bre)

                    elif isinstance(bre['equivalents'],
                                    str) and len(bre['equivalents']) > 0:
                        #len(bre ['equivalents'])>0 and bre ['equivalents'] in LabBrevets:
                        if bre['equivalents'] in LabBrevets:
                            brevet = [
                                brev for brev in LstBrevet
                                if brev['label'] == bre['equivalents']
                            ][0]
                            if isinstance(
                                    brevet["date"], list
                            ):  # dates are sometimes a list OR a string
                                date = min(brevet["date"])
                            else:
                                date = brevet["date"]
                            if len(date) < 4:
                                print("Aille")

                            # NOTE(review): here the size criterion is
                            # len(brevet.values()) (field count), while the
                            # list branch used len(str(...)) (character
                            # count) — confirm the inconsistency is intended.
                            dates = [(date, brevet, len(brevet.values()))]

                            if isinstance(
                                    bre["date"], list
                            ):  # dates are sometimes a list OR a string
                                date = min(bre["date"])
                            else:
                                date = bre["date"]

                            # Join the current patent itself.
                            dates.append((date, bre, len(bre.values())))

                            MiniDate = min([dat for dat, bre, val in dates])
                            MaxVal = max([val for dat, bre, val in dates])
                            candidat = [
                                bre for dat, bre, val in dates
                                if dat == MiniDate and val == MaxVal
                            ]
                            if len(candidat) > 1:
                                pass
                            elif len(candidat) == 0:
                                # Fall back to maximum information content.
                                candidat = [
                                    brev for dat, brev, val in dates
                                    if val == MaxVal
                                ]
                                if len(candidat) > 1:
                                    priorDateMin = min([
                                        min(brevet["prior-Date"])
                                        for brevet in candidat
                                    ])
                                    NewCandidat = [
                                        brev for brev in candidat
                                        if priorDateMin in brev["prior-Date"]
                                    ]
                                    # NOTE(review): same apparently-inverted
                                    # condition as in the list branch above.
                                    if len(NewCandidat) > 1:
                                        NewCandidat = NewCandidat[0]
                                        Filtres.append(NewCandidat)
                                    else:
                                        print("pffff")
                            else:
                                Filtres.append(candidat[0])

                        else:  # the equivalent is not in the corpus
                            Filtres.append(bre)

                    else:
                        Filtres.append(bre)
                    # NOTE(review): `dates` is only (re)assigned in the two
                    # branches above that found equivalents — on other paths
                    # it is either unbound (NameError on the first patent) or
                    # stale from a previous iteration; confirm.
                    for dat, brevet, val in dates:
                        dejaVus.append(brevet['label'])

            # Join lost patents (those not covered by any kept group).
            LabFiltered = []
            for bre in Filtres:
                if isinstance(bre["label"], str):
                    LabFiltered.append(bre['label'])
                else:
                    LabFiltered.append(bre['label'][0])

            EquivFiltered = []
            for bre in Filtres:
                for pat in bre['equivalents']:
                    EquivFiltered.append(pat)

            complement = [bre for bre in LstBrevet \
                    if bre ['label'] not in LabFiltered \
                    and sum([eq in EquivFiltered for eq in bre["equivalents"]]) ==0]

            # NOTE(review): NewFilt is built below but never used afterwards
            # (Resultat is derived from Filtres); dead code?
            NewFilt = []
            DejaVus = []
            for bre in Filtres:
                if isinstance(bre['label'], list):
                    bre['label'] = bre['label'][0]
                if bre['label'] not in DejaVus:
                    # Equivalents declared by every *other* filtered patent.
                    equi = []
                    cpFilt = copy.copy(Filtres)
                    cpFilt.remove(bre)
                    for bre1 in cpFilt:
                        if isinstance(bre1['equivalents'], list):
                            for eq in bre1['equivalents']:
                                if len(eq) > 0 and eq != 'empty':
                                    equi.append(eq)
                        elif len(bre1['equivalents']
                                 ) > 1 and bre1['equivalents'] != 'empty':
                            equi.append(bre1['equivalents'])
                        else:
                            pass
                    if len(bre['equivalents']
                           ) > 0 and bre['equivalents'] != 'empty':
                        res = sum([pat in equi for pat in bre['equivalents']] +
                                  [bre['label'] in equi])
                    # NOTE(review): `res` is only assigned when the condition
                    # above holds — otherwise it is unbound on the first pass
                    # or stale from a previous iteration; confirm.
                    if res > 0:
                        # NOTE(review): the filter `if bre['equivalents']`
                        # uses the *outer* `bre`, so it is constant for the
                        # whole comprehension — tempo ends up holding every
                        # patent of cpFilt; confirm intent.
                        tempo = [
                            (bre2['date'], bre2, len(bre2.values()))
                            for bre2 in cpFilt if bre['equivalents']
                        ]  # could go straight here and test tempo's size :-/
                        # Normalise list-valued dates into one row per date.
                        tempo2 = []
                        for dat, brevet, val in tempo:
                            if isinstance(dat, str):
                                tempo2.append((dat, brevet, val))
                            elif isinstance(dat, list):
                                for truc in dat:
                                    if len(truc) > 0:
                                        tempo2.append((truc, brevet, val))
                                    else:
                                        pass
                        tempo = tempo2
                        dates = []
                        valeurs = []
                        for dat, brevet, val in tempo:
                            dates.append(dat)
                            valeurs.append(val)
                        miniDate = min(dates)
                        maxVal = max(valeurs)
                        # NOTE(review): these comprehensions yield the outer
                        # `bre` (the loop variable), not the matching
                        # `brevet` from tempo — likely meant `brevet`.
                        tempo2 = [
                            bre for dat, brevet, val in tempo
                            if dat == miniDate and val == maxVal
                        ]
                        if len(tempo2) > 0:
                            NewFilt.append(tempo2[0])
                            if isinstance(tempo2[0]['equivalents'], list):
                                for eq in tempo2[0]['equivalents']:
                                    DejaVus.append(eq)
                            elif tempo2[0]['equivalents'] != 'empty':
                                DejaVus.append(tempo2[0]['equivalents'])
                        else:
                            tempo2 = [
                                bre for dat, brevet, val in tempo
                                if val == maxVal
                            ]
                            if len(tempo2) > 0:
                                NewFilt.append(tempo2[0])
                                if isinstance(tempo2[0]['equivalents'], list):
                                    for eq in tempo2[0]['equivalents']:
                                        DejaVus.append(eq)
                                elif tempo2[0]['equivalents'] != 'empty':
                                    DejaVus.append(tempo2[0]['equivalents'])

                                else:
                                    pass
                            else:
                                NewFilt.append(bre)

                    else:
                        NewFilt.append(bre)
                    if isinstance(bre['label'], str):
                        DejaVus.append(bre['label'])
                    else:
                        for lab in bre['label']:
                            DejaVus.append(lab)
                    if isinstance(bre['equivalents'], list):
                        for eq in bre['equivalents']:
                            DejaVus.append(eq)
                    elif bre['equivalents'] != 'empty':
                        DejaVus.append(bre['equivalents'])
                else:
                    pass

            # Normalise labels: a list-valued label collapses to its first
            # entry.
            EquivFiltered = []
            cpFilt = copy.copy(Filtres)
            for bre in Filtres:
                if not isinstance(bre['label'], str):
                    if len(bre['label']) > 0:
                        bre['label'] = bre['label'][0]
                    else:
                        print("no label !!!!")
                else:
                    pass
            # Drop patents whose equivalents were already covered by an
            # earlier kept patent.
            toRemove = []
            for bre in Filtres:
                if not isinstance(bre['equivalents'], list):
                    if len(bre['equivalents']
                           ) and bre['equivalents'] != 'empty':
                        bre['equivalents'] = [bre['equivalents']]
                    else:
                        bre['equivalents'] = []
                for pat in bre['equivalents']:
                    if pat != bre['label']:
                        if pat not in EquivFiltered:
                            EquivFiltered.append(pat)
                        else:
                            cpFilt = [
                                brev for brev in cpFilt
                                if brev['label'] != bre['label']
                            ]
                            toRemove.append((bre['label'], bre))
            exclude = [truc for truc, muche in toRemove]
            Resultat = []
            for bre in Filtres:
                if bre['label'] not in exclude:
                    Resultat.append(bre)

            EquivFiltered2 = []
            for bre in Resultat:
                for pat in bre['equivalents']:
                    EquivFiltered2.append(pat)

            print("net set of equivalent covered: ", len(EquivFiltered2))
            print(len(LstBrevet), '  --> ', len(Filtres), ' --> ',
                  len(Resultat))
            print("Good, ", len(Resultat + complement),
                  " patents filterd from equivalent unicity exrtracted from ",
                  fic)
            # Save results.
            # NOTE(review): the log above counts Resultat + complement, but
            # only Resultat is written below — complement patents are lost;
            # confirm.
            for brev in Resultat:
                with open(ResultBiblioPath + '//tempo' + fic, 'ab') as ficRes:
                    pickle.dump(brev, ficRes)
            # Keep the original as "Old<name>" and promote the filtered file.
            os.rename(ResultBiblioPath + '//' + fic,
                      ResultBiblioPath + '//Old' + fic)
            os.rename(ResultBiblioPath + '//tempo' + fic,
                      ResultBiblioPath + '//' + fic)
Exemplo n.º 5
0
# -*- coding: utf-8 -*-
"""
Created on Wed May 24 08:00:33 2017
This script load the xml IPCR descriptions text from Wipo (ipcr-2015.xml) and 
a patent universe from P2N (a list of patent according to a request).
It develops "Augmented Abstracts" consisting of each abstracts completed with
the sum of the first deepers classifications descriptions text (up to the section level) found 
in the patent metadata
@author: dreymond
"""

from lxml import etree
from Patent2Net.P2N_Lib import LoadBiblioFile, symbole
from Patent2Net.P2N_Config import LoadConfig
import sys, os, codecs
# Load configuration and derive the various working/result paths.
configFile = LoadConfig()
requete = configFile.requete
ndf = configFile.ndf
# Working directory is temporPath (see configuration).
ResultPath = configFile.ResultBiblioPath
temporPath = configFile.temporPath
ResultContentsPath = configFile.ResultContentsPath
ResultBiblioPath = configFile.ResultBiblioPath
ResultPathContent = '..//DATA//'+ndf+'//PatentContents'

# Whether or not we use only the primary classification.
Primar = True
# Cache of IPC descriptions, for performance purposes.
CIB = dict()

# NOTE(review): due to operator precedence this condition is always true —
# 'Description'+ndf is a non-empty string, so the `or` short-circuits before
# the `in` test.  Likely intended:
#   if 'Description'+ndf in os.listdir(...) or 'Description'+ndf.lower() in os.listdir(...):
if 'Description'+ndf or 'Description'+ndf.lower() in os.listdir(ResultBiblioPath): # NEW 12/12/15 new gatherer append data to pickle file in order to consume less memory
Exemplo n.º 6
0
def run():
    """Generate an HTML gallery of patent drawings.

    For each dataset prefix (plain, plus "Families" when family gathering
    is enabled) this loads the bibliographic pickle, builds thumbnails
    for every numbered TIFF drawing found on disk, and renders the
    gallery template.  Does nothing unless ``GatherImages`` is enabled.
    """

    # Bootstrap logging
    boot_logging()

    # Load configuration
    config = LoadConfig()

    # Run this only if enabled
    if not config.GatherImages:
        return

    # Get some information from configuration
    expression = config.requete
    storage_basedir = config.ResultBiblioPath
    storage_dirname = config.ndf
    output_path = config.ResultPathImages

    # Compute prefixes
    prefixes = [""]
    if config.GatherFamilly:
        prefixes.append("Families")

    # Build galleries for all prefixes
    for prefix in prefixes:

        # Status message
        label = label_from_prefix(prefix)
        logger.info("Generating gallery of drawings for {}. ".format(label))

        # Compute storage slot using prefix and DataDirectory
        # e.g. "Lentille" vs. "FamiliesLentille"
        storage_name = prefix + storage_dirname

        # Load bibliographic data
        biblio_file = LoadBiblioFile(storage_basedir, storage_name)

        # Generate thumbnails
        gallery = []
        patents = biblio_file['brevets']
        cpt = 0
        for patent in patents:
            # BUG FIX: was "cpt + 1" (a no-op), so the progress report was
            # stuck at 90 for the whole loop instead of advancing to 100.
            cpt += 1
            AnnonceProgres(Appli='p2n_image',
                           valMax=100,
                           valActu=90 + cpt * 10 / len(patents))
            patent_label = get_patent_label(patent)
            i = 1
            logger.info('Processing patent {}'.format(patent_label))
            # Drawings are stored as <label>-<n>.tiff; probe successive n
            # until a file is missing.
            path_img_base = '{}//{}-{}.tiff'.format(output_path, patent_label,
                                                    '{}')
            path = path_img_base.format(i)
            while os.path.exists(path):
                thumb, orig, tiff = generate_thumbnails(path)
                gallery.append({
                    "_id": '{}-{}'.format(patent_label, i),
                    'thumb': thumb,
                    'orig': orig,
                    'label': patent['title'],
                    'ipcr7': patent['IPCR7'],
                    'code': patent_label,
                    'tiff': tiff,
                })
                i += 1
                path = path_img_base.format(i)

        # Render gallery
        AnnonceProgres(Appli='p2n_image', valMax=100, valActu=100)
        RenderTemplate(
            'ModeleImages.html',
            output_path + '/index' + prefix + '.html',
            request=expression.replace('"', ''),
            gallery=gallery,
            json=json.dumps(gallery),
        )
Exemplo n.º 7
0
def main():
    """Split an EPO request into sub-requests of fewer than 2000 results.

    The OPS API caps a request at 2000 patents, so the original request is
    refined by publication year, then by month, then by day, and finally
    by IPC class until each refined request returns fewer than 2000 hits.
    Each refined request is written as a ``.cql`` file under the auto
    request folder, and progress is recorded through the ``*_spliter_*``
    bookkeeping helpers.

    NOTE(review): the flags jourOk / moisOk / ipcOk / monthOk are set in
    many places but never read — note also the moisOk vs. monthOk naming
    inconsistency; they appear to be dead state.
    """
    configFile = LoadConfig()

    RequestOrig = configFile.requete
    directory = configFile.ndf

    today = datetime.datetime.today()

    # NOTE(review): read_dex()'s return value is discarded — presumably
    # called for a side effect; confirm.
    read_dex()
    to_be_found = get_data_to_be_found(directory)

    # Abort when the "does this request need splitting?" check has not
    # been run yet.
    if to_be_found == None:
        print("Vous devez d'abord verifier si la requete doit être découpée")
        return None

    need_spliter = to_be_found["need_spliter"]
    lstFicOk = to_be_found["lstFicOk"]

    if need_spliter != True:
        print("Cette requete n'a pas besoin d'être découpée")
        return None

    dateDeb = get_data_spliter_start_date(directory)

    # Default start year when none was configured.
    if dateDeb == None:
        dateDeb=1900 #print("Vous devez préciser la date de début pour découper la requete")
        # return None

    targetDirectory = REQUEST_AUTO_FOLDER + directory
    if not os.path.exists(targetDirectory):
        os.makedirs(targetDirectory)

    # "date" is a placeholder later replaced by year / year+month / etc.
    Request = RequestOrig + ' AND PD=date'
    DataDir = directory + '_segments_'

    delete_data_spliter(directory)
    set_spliter_result_start(directory)

    jourOk, moisOk, ipcOk = False, False, False

    Total =0
    nbFiles = 0

    # Template .cql file with ***requete*** / ***dataDir*** placeholders.
    fic =open("Patent2Net/REQUESTS/requestModel.cql", 'r')#requestModel.cql
    DataReq = targetDirectory
    data = fic.read()
    fic.close()

    print("Start for")
    # Level 1: split by publication year.
    for AN in range(dateDeb, today.year+1,1):
        print(AN)
        Trouves = checkRequest(Request.replace('=date', '='+str(AN)))
        if 2000>Trouves>0:
            Total += Trouves
            # a request for that year is ok
            monthOk = False
            ipcOk = False
            Request2 = Request.replace('=date', '='+str(AN))
            data2 = data.replace("***requete***", Request2)
            data2 = data2.replace("***dataDir***", DataDir+str(AN))
            NameFic = str(AN)+'Request.cql'
            # NOTE(review): the file is opened with "w" (truncating any
            # existing content) even when the write below is skipped
            # because it is already in lstFicOk; confirm intent.
            with open(DataReq+"/"+NameFic, "w") as ficRes: #+"-"+ipc
                print(ficRes.name.split('/')[1])
                if ficRes.name.split('/')[1] not in lstFicOk:
                    ficRes.write(data2)
                nbFiles +=1
                print (ficRes.name, 'file written, ', Trouves,' patents expected and ', Total, ' cumulative.' )
                add_spliter_result(directory, ficRes.name, str(AN), Trouves)
                set_spliter_cumulative(directory, Total)
        if Trouves == 0:
            monthOk = False
            ipcOk = False
            jourOk = False
            #nothing to do
        if Trouves >= 2000:
            # Level 2: too many hits for the year — split by month.
            monthOk = True
            jourOk = False

            cpt= 0 #used as month counter

            for month in Months.keys():
                cpt +=1
                # Zero-pad the month number (months are numbered via cpt).
                if len(str(cpt))<2:
                    mois = '0'+str(cpt)
                else:
                    mois = str(cpt)
                Request2 = Request.replace('=date', '='+str(AN)+mois)
                Trouves = checkRequest(Request2)
                if 2000>Trouves>0:
                    Total += Trouves
                    # OK less than 2000 and more than 0 go ahead for that request
                    ipcOk = False
                    jourOk = False
                    data2 = data.replace("***requete***", Request2)
                    data2 = data2.replace("***dataDir***", DataDir+str(AN)+mois)
                    NameFic = str(AN)+mois+'Request.cql'
                    if NameFic not in lstFicOk:
                        with open(DataReq+"/"+NameFic, "w") as ficRes: #+"-"+ipc

                            ficRes.write(data2)
                        nbFiles +=1
                        print (ficRes.name, 'file written, ', Trouves,' patents expected and ', Total, ' cumulative.' )
                        add_spliter_result(directory, ficRes.name, str(AN), Trouves)
                        set_spliter_cumulative(directory, Total)
                if Trouves == 0:
                    ipcOk = False
                    jourOk = False
                    #nothing to do
                if Trouves >= 2000:
                    # Level 3: too many hits for the month — split by day.
                    monthOk = True
                    jourOk = True
                    ipcOk = False
                    for day in range(1, Months[month]+1):
                        # Zero-pad the day number.
                        if len(str(day))<2:
                            jour = '0'+str(day)
                        else:
                            jour = str(day)
                        Request2 = Request.replace('=date', '='+str(AN)+mois+jour)
                        Trouves = checkRequest(Request2)
                        if 2000>Trouves>0:
                            Total += Trouves
                            # go ahead for that day
                            ipcOk = False
                            data2 = data.replace("***requete***", Request2)
                            data2 = data2.replace("***dataDir***", DataDir+str(AN)+mois+jour)
                            NameFic = str(AN)+mois+jour+'Request.cql'
                            if NameFic not in lstFicOk:
                                with open(DataReq+"/"+NameFic, "w") as ficRes: #+"-"+ipc
                                        ficRes.write(data2)
                                nbFiles +=1
                                print (ficRes.name, 'file written, ', Trouves,' patents expected and ', Total, ' cumulative.' )
                                add_spliter_result(directory, ficRes.name, str(AN), Trouves)
                                set_spliter_cumulative(directory, Total)
                        if Trouves == 0:
                            ipcOk = False
                            jourOk = False
                            #nothing to do
                        if Trouves >= 2000:
                            # Level 4: bad day for EPO — last resort, split
                            # by IPC class for that day only.
                            monthOk = True
                            jourOk = True
                            for ipc in IPC:
                                Request3 = Request2 + " AND IC=" + ipc
                                Trouves = checkRequest(Request3)
                                if Trouves>2000:
                                    print ("thats bad... the request : " + Request3 + " should be splitted and the limits of this script are reached")
                                    break
                                Total += Trouves
                                data2 = data.replace("***requete***", Request3)
                                data2 = data2.replace("***dataDir***", DataDir+str(AN)+mois+jour+ipc)
                                # NOTE(review): NameFic here is still the
                                # *day-level* name, while the file written
                                # below uses a different (per-IPC) name —
                                # the skip check and the written file do not
                                # match; looks like a bug.
                                if NameFic not in lstFicOk:
                                    with open(DataReq+"/"+str(AN)+mois+'-'+jour+'-'+ipc+'Request.cql', "w") as ficRes: #+"-"+ipc
                                            ficRes.write(data2)
                                    nbFiles +=1
                                    print (ficRes.name, 'file written, ', Trouves,' patents expected and ', Total, ' cumulative.' )
                                    add_spliter_result(directory, ficRes.name, str(AN), Trouves)
                                    set_spliter_cumulative(directory, Total)

    set_spliter_result_end(directory)
    print ("[request_spliter] request splitted in ", nbFiles, " files")

    print ("[request_spliter] Gathering with P2N all this request should lead to ", Total, " patents")