# Image-gathering stage: load the bibliographic file for each dataset
# prefix and iterate its patents to compute per-patent image paths.
ResultPathImages = configFile.ResultPathImages
P2NFamilly = configFile.GatherFamilly

if IsEnableScript:
    # Client for the EPO Open Patent Services API, answering in JSON.
    ops_client = epo_ops.Client(key, secret)
    ops_client.accept_type = 'application/json'

    # Process the plain dataset, plus the family-expanded one if enabled.
    prefixes = [""]
    if P2NFamilly:
        prefixes.append("Families")

    for prefix in prefixes:
        # Storage slot name, e.g. "Lentille" vs. "FamiliesLentille".
        ndf = prefix + configFile.ndf

        try:
            biblio_file = LoadBiblioFile(ResultBiblioPath, ndf)
        except IOError as ex:
            # NOTE(review): only a warning is printed here; `biblio_file`
            # stays unbound (or keeps the previous prefix's value), so the
            # subscript below can raise NameError or reuse stale data —
            # a `continue` after the warning looks intended.
            print('WARNING: Could not load information for "{}". Not found / error: {}'.format(ndf, ex))

        patents = biblio_file['brevets']
        metadata = {}
        Num = len(patents)
        cpt = 0
        for patent in patents:
            cpt += 1
            AnnonceProgres (Appli='p2n_image', valMax=100, valActu=cpt*90/Num) # 10% are expected in fusion image
            patent_label = get_patent_label(patent)
            pathes = []
            # JSON metadata path, and a TIFF path template keeping a
            # trailing '{}' placeholder for the image index.
            path_json = '{}//{}.json'.format(ResultPathImages, patent_label)
            path_image = '{}//{}-{}.tiff'.format(ResultPathImages, patent_label, '{}')
            print("Processing patent {}".format(patent_label))
示例#2
0

# Network pre-processing stage: load each dataset (plain + families) and
# normalize patents that carry several labels.
Inventeurs = set()
Applicants = set()

AnnonceLog(Appli='p2n_network', texte='Net processing is starting ')
if configFile.GatherFamilly:
    PU = [ndf, 'Families' + ndf]
else:
    PU = [ndf]
for fic in PU:

    print("\n> Hi! This is Net processor used on:", fic)
    if 'Description' + fic in os.listdir(ResultBiblioPath):
        # NOTE(review): the file handle `data` opened here is never used —
        # LoadBiblioFile opens the file itself; the `with` only proves the
        # file is readable.
        with open(ResultBiblioPath + '//' + fic, 'r') as data:
            dico = LoadBiblioFile(ResultBiblioPath, fic)
    else:  # Retrocompatibility
        print("please use Comptatibilizer")
        sys.exit()
    LstBrevet = dico['brevets']

    for bre in LstBrevet:
        if isinstance(bre['label'], list):
            # if len(bre['label']) >1:
            # Only log when the label list actually contains duplicates.
            if len(bre['label']) != len(set(bre['label'])):
                AnnonceLog(
                    Appli='p2n_network',
                    texte=
                    'Good, two labels for same patent fixing to first one ' +
                    str(bre["label"]))
                #print ("two labels for same patent fixing to first one" , bre ["label"] )
示例#3
0
        return dico
    else:
        return dico


# Family gathering stage: load the previously gathered patent list before
# augmenting it with family members.
if GatherFamilly:
    print("\n> Hi! This is the family gatherer. Processing ", ndf)
    try:

        # NOTE(review): `fic` is opened but neither read nor closed in the
        # visible span — presumably just a readability check; verify and
        # consider closing it or using `with`.
        fic = open(ResultPath + '//' + ndf, 'rb')

        print("loading data file ", ndf + ' from ', ResultPath, " directory.")
        if 'Description' + ndf in os.listdir(
                ResultPath
        ):  # NEW 12/12/15 new gatherer append data to pickle file in order to consume less memory
            data = LoadBiblioFile(ResultPath, ndf)

        else:  #Retrocompatibility :-)
            print("gather your data again")
            sys.exit()
        # NOTE(review): collections.Mapping was removed in Python 3.10 —
        # this should be collections.abc.Mapping on modern interpreters.
        if isinstance(data, collections.Mapping):
            ListeBrevet = data['brevets']
            print("Found ", len(ListeBrevet), " patents gathered.")
        else:
            print(
                'data corrupted. Do something (destroying data directory is a nice idea)'
            )
            sys.exit()
        print(len(ListeBrevet), " patents loaded from file.")
        print("Augmenting list with families.")
        ficOk = True
示例#4
0
# AcadStats stage setup: resolve result directories from the configuration
# and load the bibliographic data before computing author statistics.
ResultBiblioPath = configFile.ResultBiblioPath
temporPath = configFile.temporPath
ResultGephiPath = configFile.ResultGephiPath

ResultPathContent = configFile.ResultContentsPath
ResultAbstractPath = configFile.ResultAbstractPath
Auteur = configFile.ResultPath + '//AcadCorpora'
RepDir = configFile.ResultPath + '//AcadCorpora'
project = RepDir

# NOTE(review): `BiblioPath`, `ndf` and `NeededInfo` are defined earlier in
# the original script (not visible here).
if 'Description' + ndf in os.listdir(
        BiblioPath
):  # NEW 12/12/15 new gatherer append data to pickle file in order to consume less memory
    print("loading patent biblio data with ", " and ".join(NeededInfo),
          " fields.")
    DataBrevet = LoadBiblioFile(BiblioPath, ndf)
    print("Hi this is AcadStats processor. Bibliographic data of ", ndf,
          " patent universe found.")
else:
    print("relancez P2n pour collecter les données brevet")
    sys.exit()

print("Nice, ", len(DataBrevet["brevets"]),
      " patents found. On calcule les auteurs identifiés...")

# def Nettoie(Liste):
#     indesirables = ['', u'', None, False, [], ' ', "?", "Empty", "empty"]
#     Liste = [' '.join([truc.lower().title() for truc in nom.split(' ')]) for nom in Liste ]
#     return list(filter(lambda x: x not in indesirables, Liste))

# Statistical analysis of the results
示例#5
0
# Classification stage setup: resolve result paths and load the gathered
# bibliographic data (`ficBrevet`) for the dataset `ndf`.
requete = configFile.requete
ndf = configFile.ndf
# should set a working dir one upon a time... done it is temporPath
ResultPath = configFile.ResultBiblioPath
temporPath = configFile.temporPath
ResultContentsPath = configFile.ResultContentsPath
ResultBiblioPath = configFile.ResultBiblioPath
ResultPathContent = '..//DATA//'+ndf+'//PatentContents'

# Setting whether or not we use only primary classification
Primar = True
# Setting cache for performance purposes
CIB = dict()

# BUGFIX: the original condition was
#     if 'Description'+ndf or 'Description'+ndf.lower() in os.listdir(...):
# which, by operator precedence, parses as
#     ('Description'+ndf) or ('Description'+ndf.lower() in os.listdir(...))
# A non-empty string is always truthy, so the test was always True and the
# retro-compatibility branch below was unreachable.  Check both candidate
# file names against the directory listing explicitly.
_entries = os.listdir(ResultBiblioPath)
if 'Description' + ndf in _entries or 'Description' + ndf.lower() in _entries:
    # NEW 12/12/15: the gatherer appends data to the pickle file in order
    # to consume less memory.
    ficBrevet = LoadBiblioFile(ResultBiblioPath, ndf)

else: #Retrocompatibility
    print('gather your data again. sorry')
    sys.exit()

if 'brevets' in ficBrevet:
    lstBrevet = ficBrevet['brevets']
    print("Found  datafile with ", len(lstBrevet), " patents!")
else:
    print('gather your data again')
    sys.exit()

# Bibliographic fields of interest for the downstream processing.
cles =  ['IPCR11', 'CitO', 'dateDate', 'inventor-nice', 'equivalents', 'CitedBy', 'representative', 'Inventor-Country', 'date', 'inventor', 'kind', 'priority-active-indicator', 'applicant-nice', 'IPCR1', 'country', 'IPCR3', 'applicant', 'IPCR4', 'IPCR7', 'title', 'application-ref']
示例#6
0
# Fusion stage: merge the patent lists of several requests (`lstReq`) into
# a single `BiblioRes` structure, then pickle the merged patents.
BiblioRes["brevets"] = []
BiblioRes["number"] = 0
BiblioRes["requete"] = ''
try:
    os.makedirs(ResultFolder + '//PatentBiblios')
except:
    # NOTE(review): this bare except both hides real makedirs failures and
    # piggybacks unrelated list cleanup (`res` comes from earlier code not
    # visible here) — worth splitting into explicit logic.
    if res.title() in lstReq:
        lstReq[0].remove(res.title())
    pass

#biblioFiles
for ndf in lstReq:
    lstBrevets2, nbTrouves = [], 0
    if ndf in os.listdir('..//DATA//') and ndf in os.listdir(
            '..//DATA//' + ndf + '//PatentBiblios//'):
        Brevet1 = LoadBiblioFile('..//DATA//' + ndf + '//PatentBiblios//', ndf)
        print("Doing ", ndf, "Found ", len(Brevet1["brevets"]),
              "patents in list")

    # NOTE(review): if the check above fails, `Brevet1` is unbound (or
    # stale from the previous iteration) when used below.
    BiblioRes["brevets"] = BrevetFusion(Brevet1["brevets"],
                                        BiblioRes["brevets"])
    BiblioRes["number"] = len(BiblioRes["brevets"])
    # Accumulate the request expressions as "A UNION B UNION ...".
    if len(BiblioRes["requete"]) > 0:
        BiblioRes[
            "requete"] = Brevet1["requete"] + " UNION " + BiblioRes["requete"]
    else:
        BiblioRes["requete"] = Brevet1["requete"]

# Append each merged patent to the result pickle file.
for brevet in BiblioRes["brevets"]:
    with open(ResultFolder + '//PatentBiblios//' + res, 'ab') as ficRes:
        pickle.dump(brevet, ficRes)
示例#7
0
# Indexing stage setup: connect to Elasticsearch and load the patent list.
Rep = configFile.ResultContentsPath
Bib = configFile.ResultBiblioPath

try:
    # DEBUG runs target a local node; otherwise use the docker-compose
    # service name.
    if os.getenv('DEBUG'):
        es = Elasticsearch(hosts=[{'host': "127.0.0.1", 'port': 9200}]) # this works only in debug mode
        # elastic is reach by docker inter dns name in the image as below
    else:
        es = Elasticsearch(hosts=[{'host': "elasticsearch", 'port': 9200}])
except:
    # NOTE(review): bare except; also, constructing an Elasticsearch client
    # typically does not connect eagerly, so this fallback may never fire —
    # TODO confirm against the client library version in use.
    es = Elasticsearch(hosts=[{'host': "elasticsearch", 'port': 9200}])


# Try both the exact and the title-cased dataset name for the marker file.
if 'Description' + ndf in os.listdir(
        Bib):  # NEW 12/12/15 new gatherer append data to pickle file in order to consume less memory
    DataBrevet = LoadBiblioFile(Bib, ndf)
    LstBrevet = DataBrevet['brevets']
elif 'Description' + ndf.title() in os.listdir(
        Bib):  # NEW 12/12/15 new gatherer append da
    # ta to pickle file in order to consume less memory
    DataBrevet = LoadBiblioFile(Bib, ndf.title())
    LstBrevet = DataBrevet['brevets']
else:  # Retrocompatibility
    # NOTE(review): unlike sibling scripts this branch does not sys.exit(),
    # leaving DataBrevet/LstBrevet unbound for the code that follows.
    print("please use Comptatibilizer")

def GenereListeFichiers(rep):
    """ prend un dossier en paramètre (chemin absolu) et génère la liste
    complète des fichiers TXT de l'arborescence"""
    listeFicFR = []
    listeFicEN = []
示例#8
0
        #u'CitedBy',     # the list of docs (patents) cititng this patent
        #'CitP',         # the patents cited by this patent
        #'CitO'          # the other docs cited by this patent
    ]  #"citations"

    #filterFile = [fi for fi in os.listdir(ListBiblioPath) if fi.count('Expanded')]
    srcFile = [
        fi for fi in os.listdir(ListBiblioPath) if '.pkl' not in fi
        and 'tempoInconnus' not in fi and "Description" not in fi
    ]

    for ndf in srcFile:
        if 'Description' + ndf in os.listdir(
                ListBiblioPath
        ):  # NEW 12/12/15 new gatherer append data to pickle file in order to consume less memory
            DataBrevet = LoadBiblioFile(ListBiblioPath, ndf)
            print("\n> Hi! This is FormateExportPivotTable")
        else:  #Retrocompatibility... prévious test is ugly: there is an issue with filename in lowercase (sometimes)
            print("please use Comptatibilizer")
            DataBrevet = LoadBiblioFile(ListBiblioPath,
                                        ndf)  #so I try to laod it....

        if isinstance(DataBrevet, collections.Mapping):
            #data = DataBrevet
            LstBrevet = DataBrevet['brevets']
            if 'number' in DataBrevet:
                print("Found ", DataBrevet["number"],
                      " patents! Formating into HMTL Pivot tables")
            else:
                print("Found ", len(DataBrevet["brevets"]),
                      " patents! Trying to format into HMTL Pivot tables")
示例#9
0
def read(path, slot):
    """Load the bibliographic data stored under *slot* inside *path*.

    The data is only loaded when the ``Description<slot>`` marker file is
    present in the directory; otherwise ``None`` is returned.
    """
    marker = 'Description' + slot
    if marker not in os.listdir(path):
        return None
    return LoadBiblioFile(path, slot)
示例#10
0
        else:
            pass

AnnonceLog(Appli='p2n_gather_biblio',
           texte="Found almost" + str(len(lstBrevets)) +
           " patents. Saving list")
AnnonceLog(Appli='p2n_gather_biblio',
           texte="Within " + str(len(set(listeLabel))) + " unique patents")
print("Found almost", len(lstBrevets), " patents. Saving list")
print("Within ", len(set(listeLabel)), " unique patents")
BibliDataBrevets = dict()
BibliDataBrevets['brevets'] = []

# loading already gathered bibliographic daata
if ndf in os.listdir(ResultBiblioPath):
    BibliDataBrevets = LoadBiblioFile(ResultBiblioPath, ndf)
    #        with codecs.open(ResultBiblioPath + '//' + ndf, 'rb', "utf-8") as fic:
    #            while 1:
    #                try:
    #                    DataBrevets['brevets'].append(byteify(pickle.load(fic)))
    #                except EOFError:
    #                    break

    if len(BibliDataBrevets['brevets']) == len(listeLabel):
        print(len(BibliDataBrevets['brevets']),
              " bibliographic patent data gathered yet? ")
        GatherBibli = False

        AnnonceProgres(Appli='p2n_gather_biblio', valMax=100, valActu=100)
        sys.exit('Nothing else to do :-). Good bye')
    else:
示例#11
0
def run():
    """Generate a gallery of patent drawings.

    Loads the configured bibliographic data, walks the consecutively
    numbered TIFF images previously downloaded for each patent, produces
    thumbnails for them, and renders an ``index<prefix>.html`` gallery per
    dataset prefix.  Returns immediately when image gathering is disabled
    in the configuration.
    """

    # Bootstrap logging
    boot_logging()

    # Load configuration
    config = LoadConfig()

    # Run this only if enabled
    if not config.GatherImages:
        return

    # Get some information from configuration
    expression = config.requete
    storage_basedir = config.ResultBiblioPath
    storage_dirname = config.ndf
    output_path = config.ResultPathImages

    # Compute prefixes: the plain dataset plus the family-expanded one.
    prefixes = [""]
    if config.GatherFamilly:
        prefixes.append("Families")

    # Build maps for all prefixes
    for prefix in prefixes:

        # Status message
        label = label_from_prefix(prefix)
        logger.info("Generating gallery of drawings for {}. ".format(label))

        # Compute storage slot using prefix and DataDirectory
        # e.g. "Lentille" vs. "FamiliesLentille"
        storage_name = prefix + storage_dirname

        # Load bibliographic data
        biblio_file = LoadBiblioFile(storage_basedir, storage_name)

        # Generate thumbnails
        gallery = []
        patents = biblio_file['brevets']
        cpt = 0
        for patent in patents:
            # BUGFIX: was the no-op expression ``cpt + 1``, which left the
            # progress indicator stuck at 90% for the whole loop.
            cpt += 1
            AnnonceProgres(Appli='p2n_image',
                           valMax=100,
                           valActu=90 + cpt * 10 / len(patents))
            patent_label = get_patent_label(patent)
            i = 1
            logger.info('Processing patent {}'.format(patent_label))
            # TIFF path template keeping a '{}' placeholder for the index.
            path_img_base = '{}//{}-{}.tiff'.format(output_path, patent_label,
                                                    '{}')
            path = path_img_base.format(i)
            # Consume consecutively numbered images until the first gap.
            while os.path.exists(path):
                thumb, orig, tiff = generate_thumbnails(path)
                gallery.append({
                    "_id": '{}-{}'.format(patent_label, i),
                    'thumb': thumb,
                    'orig': orig,
                    'label': patent['title'],
                    'ipcr7': patent['IPCR7'],
                    'code': patent_label,
                    'tiff': tiff,
                })
                i += 1
                path = path_img_base.format(i)

        # Render gallery
        AnnonceProgres(Appli='p2n_image', valMax=100, valActu=100)
        RenderTemplate(
            'ModeleImages.html',
            output_path + '/index' + prefix + '.html',
            request=expression.replace('"', ''),
            gallery=gallery,
            json=json.dumps(gallery),
        )
示例#12
0
# Carrot2 export stage: load each dataset prefix and prepare the Carrot2
# output directory before iterating content sections and languages.
if IsEnableScript:
    Rep = configFile.ResultContentsPath
    Bib = configFile.ResultBiblioPath

    prefixes = [""]
    if GatherFamilly:
        prefixes.append("Families")

    for prefix in prefixes:
        ndf = prefix + configFile.ndf

        if 'Description' + ndf in os.listdir(
                Bib
        ):  # NEW 12/12/15 new gatherer append data to pickle file in order to consume less memory
            DataBrevet = LoadBiblioFile(Bib, ndf)
            LstBrevet = DataBrevet['brevets']
        else:  #Retrocompatibility
            # NOTE(review): no sys.exit() here, so DataBrevet/LstBrevet may
            # be unbound (or stale from the previous prefix) further down.
            print("please use Comptatibilizer")

        # Tolerate the output directory already existing.
        try:
            os.makedirs(Rep + "//Carrot2")
        except:
            #directory exists
            pass
        temporar = GenereListeFichiers(Rep)
        cpt = 0
        for det in ['Abstract', 'Claims', 'Description']:
            ind = 0
            cpt += 1
            for lang in ['FR', 'EN', 'UNK']:
示例#13
0
        'kind', 'applicant', 'country', 'inventor', 'representative', 'IPCR4',
        'IPCR7', "Inventor-Country", "Applicant-Country", "equivalents", "CPC",
        'references', 'Citations', 'CitedBy'
    ]

    prefixes = [""]
    if GatherFamilly:
        prefixes.append("Families")

    for prefix in prefixes:
        ndf = prefix + configFile.ndf

        if 'Description' + ndf in os.listdir(
                ListBiblioPath
        ):  # NEW 12/12/15 new gatherer append data to pickle file in order to consume less memory
            LstBrevet = LoadBiblioFile(ListBiblioPath, ndf)
            with open(ListBiblioPath + '//Description' + ndf, 'rb') as ficRes:
                DataBrevet = pickle.load(ficRes)
        else:  #Retrocompatibility
            with open(ListBiblioPath + '//' + ndf, 'rb') as data:
                LstBrevet = pickle.load(data)

        ##next may need clarifying update

        data = LstBrevet
        LstBrevet = data['brevets']
        if 'requete' in data:
            requete = data["requete"]
        if 'number' in data:
            print("Found ", data["number"],
                  " patents! Formating to HMTL tables")
示例#14
0
        NeededInfo.extend(mixNet)  # list of needed field for building the net
        # may be should use  from
        # from collections import OrderedDict
        # class OrderedNodeGraph(nx.Graph):
        #   node_dict_factory=OrderedDict
        # G = OrderedNodeGraph()
        G1 = nx.MultiDiGraph()  # Multi edges directed network for Gephi
        attr_dict = dict()  # attributes for the net
        # flat net for gexf.js may be it is possible to use previous instead of this one...

        if 'Description' + ndf in os.listdir(
                BiblioPath
        ):  # NEW 12/12/15 new gatherer append data to pickle file in order to consume less memory
            print(network, ": loading data with ", " and ".join(mixNet),
                  " fields.")
            DataBrevet = LoadBiblioFile(BiblioPath, ndf)
            print("Hi this is Pre-Network processor. Bibliographic data of ",
                  ndf, " patent universe found.")
        else:  #Retrocompatibility
            print("please use Comptatibilizer")

        print("Nice, ", len(DataBrevet["brevets"]),
              " patents found. Pre-formating ", sys.argv[1], " net.")
        for brev in DataBrevet["brevets"]:
            #tempo = pickle.load(fic) # we only memorize needed nfo
            pat = OrderedDict()
            if "date" not in list(brev.keys()):
                brev['date'] = '1-1-1'
            if isinstance(brev['label'], list):
                brev['label'] = brev['label'][0]
            for key in NeededInfo:
示例#15
0
#listeLabel = []
# Entering PatentBiblio feeding
print("Checking and/or gathering bibliographic data")
if GatherBibli and GatherBiblio:
    # Build the list of patent labels (country code + doc number) used for
    # file-system storage of abstracts, claims, etc.
    for brevet in lstBrevets:
        # nameOfPatent for file system save (abstract, claims...)
        ndb = brevet['document-id']['country']['$'] + brevet['document-id'][
            'doc-number']['$']
        listeLabel.append(ndb)
    print("Found almost", len(lstBrevets), " patents. Saving list")
    print("Within ", len(set(listeLabel)), " unique patents")
    DataBrevets = dict()
    DataBrevets['brevets'] = []
    # Reuse any previously gathered bibliographic data for this dataset.
    if ndf in os.listdir(ResultBiblioPath):
        DataBrevets = LoadBiblioFile(ResultBiblioPath, ndf)
        #        with codecs.open(ResultBiblioPath + '//' + ndf, 'rb', "utf-8") as fic:
        #            while 1:
        #                try:
        #                    DataBrevets['brevets'].append(byteify(pickle.load(fic)))
        #                except EOFError:
        #                    break

        # Everything already gathered: nothing left to do.
        if len(DataBrevets['brevets']) == len(listeLabel):
            print(len(DataBrevets['brevets']),
                  " bibliographic patent data gathered yet? ")
            GatherBibli = False
            sys.exit('Nothing else to do :-). Good bye')
        else:
            print(
                len(listeLabel) - len(DataBrevets['brevets']),
nbAppliAvant = dict()
nbInvAvant = dict()
# processing of the files + families
if GatherFamilly:
    PU = [ndf, 'Families' + ndf]
else:
    PU = [ndf]
for fic in PU:
    cptInv, cptAppl = 0, 0

    print(
        "\n> Hi! This is Pre Process for normalizing applicant names: used on:",
        fic)
    if 'Description' + fic in os.listdir(ListBiblioPath):
        # NOTE(review): the handle `data` is unused — LoadBiblioFile opens
        # the file itself; the `with` only proves the file is readable.
        with open(ListBiblioPath + '//' + fic, 'r', encoding="utf8") as data:
            dico = LoadBiblioFile(ListBiblioPath, fic)
    else:  # Retrocompatibility
        print("please use Comptatibilizer")
        sys.exit()
    LstBrevet = dico['brevets']
    print("Good, ", len(LstBrevet),
          " patents found filterd from equivalent unicity")
    # patent filtering process: keep each label once, tracking equivalents.
    Filtres = []
    dejaVus = []
    for bre in LstBrevet:
        if bre['label'] not in dejaVus:
            dejaVus.append(bre['label'])
            Filtres.append(bre)
            if isinstance(bre['equivalents'], list):
                for eq in bre['equivalents']:
示例#17
0
# AcadCorpora splitter stage setup: resolve the result directories and load
# the full bibliographic data for the dataset.
temporPath = configFile.temporPath
ResultPathContent = configFile.ResultContentsPath
ResultAbstractPath = configFile.ResultAbstractPath
ListBiblioPath = configFile.ResultBiblioPath
# special path used with AcadPubMed.py
Auteur = configFile.ResultPath + '//AcadCorpora'
RepDir = configFile.ResultPath + '//AcadCorpora'
project = RepDir
if 'AcadCorpora' not in os.listdir(configFile.ResultPath):
    print("relancez le script de collecte (AcadPubMed.py 29/06/2019)")
    sys.exit()
# NOTE(review): `BiblioPath` is not assigned in this span while
# `ListBiblioPath` just was — this looks like it should be ListBiblioPath;
# confirm against the variables defined earlier in the original script.
if 'Description' + ndf in os.listdir(
        BiblioPath
):  # NEW 12/12/15 new gatherer append data to pickle file in order to consume less memory
    print("loading patent biblio data with all fields.")
    DataBrevet = LoadBiblioFile(BiblioPath, ndf)
    print(
        "Hi this is AcadStatsAcad Corpora splitter processor. Bibliographic data of ",
        ndf, " patent universe found.")
else:
    print("relancez P2n pour collecter les données brevet")
    sys.exit()

print(
    "Nice, ", len(DataBrevet["brevets"]),
    " patents found. Découpage selon le données du tablea EntitésPubliquesNorm.xlsx"
)

# test de consistance
# with open(Auteur+'//DejaTraites.csv', 'r',) as fic:
#     DejaVus = fic.readlines()
示例#18
0
        "Inventor-Country",
        "Applicant-Country",
        "equivalents",
        "CPC",
        'prior-Date',  #'prior-dateDate', # dates of priority claims
        'references',  # the number of refences into the document len(CitP) + len(CitO)
        'Citations',  # the number of citations granted by the document
        'CitedBy',  # the list of docs (patents) cititng this patent
        'CitP',  # the patents cited by this patent
        'CitO'  # the other docs cited by this patent
    ]  #"citations"

    if 'Description' + ndf in os.listdir(
            ListBiblioPath
    ):  # NEW 12/12/15 new gatherer append data to pickle file in order to consume less memory
        dico = LoadBiblioFile(ListBiblioPath, ndf)

    else:  #Retrocompatibility
        with open(ListBiblioPath + '//' + ndf, 'r') as data:
            dico = pickle.load(data)

    LstBrevet = dico['brevets']
    if 'requete' in dico:
        requete = dico["requete"]
    if 'number' in dico:
        print("Found ", dico["number"], " patents! Formating to HMTL tables")
    else:
        print("Found ", len(LstBrevet), " patents! Formating to HMTL tables")
    LstExp = []
    LstExp2 = []
    #just for testing las fnction in gathered should deseapear soon
示例#19
0
# FreePlane export stage: load each dataset prefix and normalize the
# classification field before building the mind-map.
ResultListPath = configFile.ResultListPath
ResultBiblioPath = configFile.ResultBiblioPath

if IsEnableScript:
    LoadDescs()

    prefixes = [""]
    if P2NFamilly:
        prefixes.append("Families")

    for prefix in prefixes:
        ndf = prefix + configFile.ndf
        try:
            # NOTE(review): the handle `fic` is unused — LoadBiblioFile
            # opens the file itself.
            with open(ResultBiblioPath + '//' + ndf, 'r') as fic:
                DataBrevets1 = LoadBiblioFile(ResultBiblioPath, ndf)
                BrevetsTotal = str(len(DataBrevets1['brevets']))
        except:
            # NOTE(review): bare except; on failure DataBrevets1 stays
            # unbound and the loop below raises NameError.
            print("Error: there are no data to generate de FreePlane file")
        # End of Load patent file
        #

        ### ugly code to patch classification extraction inconsistency
        for bre in DataBrevets1['brevets']:
            # Drop empty entries, then mirror the list into IPCR11.
            if isinstance(bre['classification'], list):
                if '' in bre['classification']:
                    bre['classification'].remove('')
            bre['IPCR11'] = bre['classification']

            lstIPC = [ipc[0] for ipc in bre['classification']]
            for ipc in lstIPC:
示例#20
0
def run():

    aujourd = datetime.date.today()

    configFile = LoadConfig()
    requete = configFile.requete
    ndf = configFile.ndf
    Gather = configFile.GatherContent
    GatherBiblio = configFile.GatherBiblio
    GatherPatent = configFile.GatherPatent
    GatherFamilly = configFile.GatherFamilly
    IsEnableScript = configFile.FormateExportDataTable

    #should set a working dir one upon a time... done it is temporPath
    ListBiblioPath = configFile.ResultBiblioPath
    temporPath = configFile.temporPath
    ResultPathContent = configFile.ResultPath

    ResultListPath = configFile.ResultListPath
    ResultBiblioPath = configFile.ResultBiblioPath
    # Lecture du fichier de référence

    lstApplic = []

    Inventeurs = []
    Applicants = []
    nbAppliAvant = dict()
    nbInvAvant = dict()

    # traitement des fichiers + familles
    if GatherFamilly:
        PU = [ndf, 'Families' + ndf]
    else:
        PU = [ndf]

    for fic in PU:
        if 'Old' + fic not in os.listdir(ResultBiblioPath):
            cptInv, cptAppl = 0, 0

            print(
                "\n> Hi! This is Pre Process for filtering equivalents patents from dataset gathered by P2N-OPSGather: used on:",
                fic)
            if 'Description' + fic in os.listdir(ListBiblioPath):
                with open(ListBiblioPath + '//' + fic, 'r',
                          encoding="utf8") as data:
                    dico = LoadBiblioFile(ListBiblioPath, fic)
            else:  # Retrocompatibility
                print("please use Comptatibilizer")
                sys.exit()
            LstBrevet = dico['brevets']
            # patent filtering process
            Filtres = []
            dejaVus = []
            LabBrevets = [brev['label'] for brev in LstBrevet]
            for bre in LstBrevet:  # parcours de la listee des brevets
                if bre['label'] not in dejaVus:  # si pas vu
                    dejaVus.append(bre['label'])  #rajout aux vus

                    # parcours des equivalents. Deux cas : une liste ou un chaine non vide
                    if isinstance(bre['equivalents'],
                                  list) and len(bre['equivalents']) > 0:
                        # récupération de la liste des dates  de chaque équvalents
                        dates = []
                        for brev in bre['equivalents']:
                            if brev in LabBrevets:  # si celui-ci fait partie des brevets de départ (sinon ignoré)
                                for brevet in LstBrevet:  # on va le chercher
                                    if brevet[
                                            'label'] == brev:  # yes c'est lui !!!
                                        if isinstance(
                                                brevet["date"], list
                                        ):  # les dates sont quelquefois en liste OU en chaine :-()
                                            date = min(brevet["date"])
                                        else:
                                            date = brevet["date"]
                                        # on rajoute à la structure adhoc : date, brevet, taille (nb de caractères)
                                        if len(date) < 4:
                                            print("Aille")
                                        dates.append((date, bre,
                                                      len(str(bre.values()))))
                                        #dates.extend((brevet["date"][0], brevet, len(str(brevet.values()))) for brevet in LstBrevet if brevet['label'] == brev )
                                dejaVus.append(brev)

                        if len(dates) == 1:  # pas d'ambiguité
                            Filtres.append(dates[0][1])
                        elif len(dates) > 1:  #récupération du plus vieux
                            MiniDate = min([dat for dat, brev, val in dates])
                            MaxVal = max(val for dat, brev, val in dates)
                            if len(MiniDate) < 5:
                                print(bre['prior-Date'], ' -- > ', MiniDate)
                            # giving priority to the first in date apparition maximizing lenght (su^^posed to be fields with information)
                            candidat = [
                                bre for dat, bre, val in dates
                                if dat == MiniDate and val == MaxVal
                            ]
                            if len(candidat
                                   ) == 0:  # ou au max d'apport informationnel
                                # if it doens't work giving priority to max information content
                                candidat = [
                                    brev for dat, brev, val in dates
                                    if val == MaxVal
                                ]
                                if len(candidat) > 1:
                                    priorDateMin = min([
                                        min(brevet["prior-Date"])
                                        for brevet in candidat
                                    ])
                                    NewCandidat = [
                                        brev for brev in candidat
                                        if priorDateMin in brev["prior-Date"]
                                    ]
                                    if len(NewCandidat) > 1:
                                        NewCandidat = NewCandidat[0]
                                        Filtres.append(NewCandidat)
                                    else:
                                        print("pffff")
                            else:  #aucun des équivalents dans la liste
                                Filtres.append(candidat[0])
                        else:
                            Filtres.append(bre)

                    elif isinstance(bre['equivalents'],
                                    str) and len(bre['equivalents']) > 0:
                        #len(bre ['equivalents'])>0 and bre ['equivalents'] in LabBrevets:
                        if bre['equivalents'] in LabBrevets:
                            brevet = [
                                brev for brev in LstBrevet
                                if brev['label'] == bre['equivalents']
                            ][0]
                            if isinstance(
                                    brevet["date"], list
                            ):  # les dates sont quelquefois en liste OU en chaine :-()
                                date = min(brevet["date"])
                            else:
                                date = brevet["date"]
                            if len(date) < 4:
                                print("Aille")

                            dates = [(date, brevet, len(brevet.values()))]

                            if isinstance(
                                    bre["date"], list
                            ):  # dates are sometimes a list OR a plain string
                                date = min(bre["date"])
                            else:
                                date = bre["date"]

                            # register the current patent as a candidate:
                            # (date, patent record, number of fields it carries)
                            dates.append((date, bre, len(bre.values())))

                            # representative = earliest date AND richest record
                            MiniDate = min([dat for dat, bre, val in dates])
                            MaxVal = max([val for dat, bre, val in dates])
                            candidat = [
                                bre for dat, bre, val in dates
                                if dat == MiniDate and val == MaxVal
                            ]
                            if len(candidat) > 1:
                                # NOTE(review): several exact ties — nothing is
                                # appended to Filtres in this case; confirm
                                # this is intentional.
                                pass
                            elif len(candidat) == 0:
                                # if it doesn't work, give priority to maximum
                                # information content only
                                candidat = [
                                    brev for dat, brev, val in dates
                                    if val == MaxVal
                                ]
                                if len(candidat) > 1:
                                    # tie-break on the earliest priority date
                                    priorDateMin = min([
                                        min(brevet["prior-Date"])
                                        for brevet in candidat
                                    ])
                                    NewCandidat = [
                                        brev for brev in candidat
                                        if priorDateMin in brev["prior-Date"]
                                    ]
                                    if len(NewCandidat) > 1:
                                        NewCandidat = NewCandidat[0]
                                        Filtres.append(NewCandidat)
                                    else:
                                        # NOTE(review): a SINGLE remaining
                                        # candidate also lands here and is
                                        # silently dropped (`> 1` above looks
                                        # like it should be `>= 1`) — confirm.
                                        print("pffff")
                            else:
                                Filtres.append(candidat[0])

                        else:  # equivalent not present in the corpus
                            Filtres.append(bre)

                    else:
                        Filtres.append(bre)
                    # remember every candidate examined for this patent
                    for dat, brevet, val in dates:
                        dejaVus.append(brevet['label'])

            # Gather back the "lost" patents: those that were neither kept in
            # Filtres nor covered by an equivalent of a kept patent.
            LabFiltered = [
                brevet['label'] if isinstance(brevet['label'], str)
                else brevet['label'][0]
                for brevet in Filtres
            ]

            EquivFiltered = []
            for brevet in Filtres:
                EquivFiltered.extend(brevet['equivalents'])

            complement = []
            for brevet in LstBrevet:
                if brevet['label'] in LabFiltered:
                    continue
                if any(eq in EquivFiltered for eq in brevet['equivalents']):
                    continue
                complement.append(brevet)

            # Deduplicate `Filtres` through the equivalence relation: for each
            # patent not yet seen, if it overlaps another kept patent (shared
            # label or equivalents) keep one representative — earliest date,
            # then largest information content — otherwise keep the patent
            # itself.  DejaVus accumulates every label/equivalent handled.
            NewFilt = []
            DejaVus = []
            for bre in Filtres:
                if isinstance(bre['label'], list):
                    bre['label'] = bre['label'][0]  # normalise multi-labels
                if bre['label'] not in DejaVus:
                    # Every equivalent declared by the OTHER kept patents.
                    equi = []
                    cpFilt = copy.copy(Filtres)
                    cpFilt.remove(bre)
                    for bre1 in cpFilt:
                        if isinstance(bre1['equivalents'], list):
                            for eq in bre1['equivalents']:
                                if len(eq) > 0 and eq != 'empty':
                                    equi.append(eq)
                        elif len(bre1['equivalents']
                                 ) > 1 and bre1['equivalents'] != 'empty':
                            equi.append(bre1['equivalents'])
                    # res > 0  <=>  this patent overlaps another kept patent.
                    # BUGFIX: `res` used to be left unbound on the first
                    # iteration (NameError) or stale from a previous patent
                    # whenever this patent had no usable equivalents.
                    res = 0
                    if len(bre['equivalents']
                           ) > 0 and bre['equivalents'] != 'empty':
                        res = sum([pat in equi for pat in bre['equivalents']] +
                                  [bre['label'] in equi])
                    if res > 0:
                        # NOTE(review): `if bre['equivalents']` is constant for
                        # the whole comprehension — kept as-is.
                        tempo = [
                            (bre2['date'], bre2, len(bre2.values()))
                            for bre2 in cpFilt if bre['equivalents']
                        ]
                        # Dates may be strings or lists of strings: flatten to
                        # one (date, patent, value) triple per usable date.
                        tempo2 = []
                        for dat, brevet, val in tempo:
                            if isinstance(dat, str):
                                tempo2.append((dat, brevet, val))
                            elif isinstance(dat, list):
                                for truc in dat:
                                    if len(truc) > 0:
                                        tempo2.append((truc, brevet, val))
                        tempo = tempo2
                        dates = [dat for dat, brevet, val in tempo]
                        valeurs = [val for dat, brevet, val in tempo]
                        miniDate = min(dates)
                        maxVal = max(valeurs)
                        # BUGFIX: both selection comprehensions below used to
                        # yield the OUTER `bre` instead of the unpacked
                        # candidate `brevet`, so the earliest/richest
                        # equivalent was never actually selected.
                        tempo2 = [
                            brevet for dat, brevet, val in tempo
                            if dat == miniDate and val == maxVal
                        ]
                        if len(tempo2) > 0:
                            NewFilt.append(tempo2[0])
                            if isinstance(tempo2[0]['equivalents'], list):
                                for eq in tempo2[0]['equivalents']:
                                    DejaVus.append(eq)
                            elif tempo2[0]['equivalents'] != 'empty':
                                DejaVus.append(tempo2[0]['equivalents'])
                        else:
                            # Relax the date constraint: max content only.
                            tempo2 = [
                                brevet for dat, brevet, val in tempo
                                if val == maxVal
                            ]
                            if len(tempo2) > 0:
                                NewFilt.append(tempo2[0])
                                if isinstance(tempo2[0]['equivalents'], list):
                                    for eq in tempo2[0]['equivalents']:
                                        DejaVus.append(eq)
                                elif tempo2[0]['equivalents'] != 'empty':
                                    DejaVus.append(tempo2[0]['equivalents'])
                            else:
                                NewFilt.append(bre)
                    else:
                        NewFilt.append(bre)  # no overlap: keep as-is
                    # Mark this patent's labels and equivalents as handled.
                    if isinstance(bre['label'], str):
                        DejaVus.append(bre['label'])
                    else:
                        for lab in bre['label']:
                            DejaVus.append(lab)
                    if isinstance(bre['equivalents'], list):
                        for eq in bre['equivalents']:
                            DejaVus.append(eq)
                    elif bre['equivalents'] != 'empty':
                        DejaVus.append(bre['equivalents'])

            # Final pass: normalise labels/equivalents in place, then drop any
            # patent one of whose equivalents was already claimed by an
            # earlier patent, and persist the result atomically via a temp
            # file + rename.
            EquivFiltered = []
            cpFilt = copy.copy(Filtres)
            for bre in Filtres:
                # Normalise list labels to their first entry.
                if not isinstance(bre['label'], str):
                    if len(bre['label']) > 0:
                        bre['label'] = bre['label'][0]
                    else:
                        print("no label !!!!")
            toRemove = []
            for bre in Filtres:
                # Normalise 'equivalents' to a (possibly empty) list.
                if not isinstance(bre['equivalents'], list):
                    if len(bre['equivalents']
                           ) and bre['equivalents'] != 'empty':
                        bre['equivalents'] = [bre['equivalents']]
                    else:
                        bre['equivalents'] = []
                for pat in bre['equivalents']:
                    if pat != bre['label']:
                        if pat not in EquivFiltered:
                            EquivFiltered.append(pat)
                        else:
                            # Equivalent already claimed by another patent:
                            # schedule this one for removal.
                            cpFilt = [
                                brev for brev in cpFilt
                                if brev['label'] != bre['label']
                            ]
                            toRemove.append((bre['label'], bre))
            exclude = [truc for truc, muche in toRemove]
            Resultat = [bre for bre in Filtres if bre['label'] not in exclude]

            EquivFiltered2 = []
            for bre in Resultat:
                for pat in bre['equivalents']:
                    EquivFiltered2.append(pat)

            print("net set of equivalent covered: ", len(EquivFiltered2))
            print(len(LstBrevet), '  --> ', len(Filtres), ' --> ',
                  len(Resultat))
            print("Good, ", len(Resultat + complement),
                  " patents filterd from equivalent unicity exrtracted from ",
                  fic)
            # Saving file.  BUGFIX(perf): open the temp file once instead of
            # re-opening it in append mode for every single patent.
            # NOTE(review): only Resultat is persisted although the report
            # above counts Resultat + complement — confirm whether the
            # complement should be written too.
            with open(ResultBiblioPath + '//tempo' + fic, 'ab') as ficRes:
                for brev in Resultat:
                    pickle.dump(brev, ficRes)
            os.rename(ResultBiblioPath + '//' + fic,
                      ResultBiblioPath + '//Old' + fic)
            os.rename(ResultBiblioPath + '//tempo' + fic,
                      ResultBiblioPath + '//' + fic)
# --- scraper artifact: separator between concatenated snippets (original text: "示例#21" / "0") ---
# Flags controlling which corpora were gathered (taken from configFile).
GatherBiblio = configFile.GatherBiblio
GatherPatent = configFile.GatherPatent
GatherFamilly = configFile.GatherFamilly

# should set a working dir one upon a time... done it is temporPath
ResultBiblioPath = configFile.ResultBiblioPath
# NOTE(review): ResultPatentPath is filled from configFile.ResultListPath —
# confirm the name mismatch is intentional.
ResultPatentPath = configFile.ResultListPath
ResultContentsPath = configFile.ResultContentsPath

GlobalPath = configFile.GlobalPath

# take request from BiblioPatent file

# NEW 12/12/15 new gatherer append data to pickle file in order to consume less memory
if 'Description' + ndf in os.listdir(ResultBiblioPath):
    # Load the main biblio file and keep the original request string.
    data = LoadBiblioFile(ResultBiblioPath, ndf)
    requete = data['requete']
else:  # Retrocompatibility
    # NOTE(review): unlike sibling scripts this branch does not sys.exit(),
    # and `requete` stays undefined here — confirm downstream handles that.
    print("please use Comptatibilizer")
    # if 'Fusion' in data.keys()
    data = dict()
if GatherFamilly:  # pdate needed for families
    # NEW 12/12/15 new gatherer append data to pickle file in order to consume less memory
    if 'DescriptionFamilies' + ndf in os.listdir(ResultBiblioPath):
        # Family corpus: only the patent count is needed here.
        data2 = LoadBiblioFile(ResultBiblioPath, 'Families' + ndf)
        nbFam = len(data2['brevets'])
    else:  # Retrocompatibility
        print("please use Comptatibilizer")
    # if 'Fusion' in data.keys()with open( ResultBiblioPath+'//Families'+ndf, 'r') as ficBib:
#        data2 = cPickle.load(ficBib)