def clean_patents(source_path, destination_path, mixed_destination_path, csv_eu_path):
    """Parse the XML patents under each source folder and dispatch the English ones.

    For every ``.xml`` file listed by ``fh.get_list_files`` the root element's
    ``lang``, ``kind`` and ``file`` attributes are read. Documents whose
    language is ``EN`` are handed to ``eu_parser`` when the file name starts
    with ``EP`` and to ``us_parser`` when it starts with ``US``; everything
    else is ignored.

    Args:
        source_path: iterable of folders to scan.
        destination_path: sequence indexed in parallel with ``source_path``;
            the matching entry is passed to the parsers.
        mixed_destination_path: sequence indexed in parallel with
            ``source_path``; only passed to ``eu_parser``.
        csv_eu_path: path given to ``xmlh.load_index_csv``; the loaded frame is
            passed to ``eu_parser``.

    Returns:
        None. ``patent_data`` is created here and passed to ``eu_parser`` but
        is not returned.
    """
    patent_data = pd.DataFrame(columns=['filename', 'applicant', 'citations'])
    csv_dataframe = xmlh.load_index_csv(csv_eu_path)
    for index, path in enumerate(source_path):
        files = fh.get_list_files(path, 'xml')
        for path_filename in tqdm(files):
            try:
                patent_document = et.parse(path_filename).getroot()

                # country, date, doc_n, dtd_ver, file, id, kind, lang, status
                attributes = patent_document.attrib
                lang = attributes['lang'].upper()
                kind = attributes['kind'].upper()

                if lang != 'EN':
                    continue

                filename = attributes['file']
                region = filename[:2].upper()
                if region == 'EP':
                    eu_parser(patent_document, attributes, filename, kind,
                              destination_path[index], mixed_destination_path[index],
                              csv_dataframe, patent_data)
                elif region == 'US':
                    us_parser(patent_document, filename, kind, destination_path[index])
            except Exception:
                # Best-effort batch: report the offending file and keep going.
                # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit
                # still stop the run.
                print("WARNING!!!! Check out the patent: ", path_filename)
                continue
Пример #2
0
def load_path_data(source_path, destination_path):
    """Collect the XML file paths found under each source folder.

    Returns a list with one tuple per entry of ``source_path``:
    ``(frame, destination_path[i], folder)``, where ``frame`` is a DataFrame
    with a single ``file_path`` column holding one row per listed file
    (row labels start at 1).
    """
    print("seeking for patents ...")
    collected = []
    for position, folder in enumerate(source_path):
        frame = pd.DataFrame(columns=['file_path'])
        for xml_file in fh.get_list_files(folder, 'xml'):
            next_label = frame.shape[0] + 1
            frame.loc[next_label] = [xml_file]
        entry = (frame, destination_path[position], folder)
        collected.append(entry)
    return collected
def handle_complete_args(source_path, folder_level):
    """Expand ``source_path`` into its entries and coerce ``folder_level`` to int.

    The path is joined with ``'*'`` via ``fh.link_paths``, listed with
    ``fh.get_list_files`` and every entry gets a trailing ``'/'``.

    Returns ``(source_path, folder_level)``. When nothing was listed, or the
    last entry (ignoring the trailing slash) ends in ``.xml``, the result of
    ``source_path_warnings()`` is returned instead.
    """
    pattern = fh.link_paths(source_path, '*')
    entries = fh.get_list_files(pattern, None)
    source_path = [entry + '/' for entry in entries]
    print("source path: %s" % source_path)
    folder_level = int(folder_level)
    print("folder destination level: %s" % folder_level)

    nothing_listed = len(source_path) == 0
    # The last character is the slash appended above, so strip it before
    # looking at the extension.
    if nothing_listed or source_path[-1][:-1].endswith('.xml'):
        return source_path_warnings()
    return source_path, folder_level
def handle_partial_args(source_path):
    """Expand ``source_path`` into its entries and derive the folder level.

    The path is joined with ``'*'`` via ``fh.link_paths``, listed with
    ``fh.get_list_files`` and every entry gets a trailing ``'/'``.

    Returns ``(source_path, folder_level)`` where ``folder_level`` is the
    number of ``'/'`` in the first entry minus one. When nothing was listed,
    or the last entry (ignoring the trailing slash) ends in ``.xml``, the
    result of ``th.source_path_warnings()`` is returned instead.
    """
    pattern = fh.link_paths(source_path, '*')
    entries = fh.get_list_files(pattern, None)
    source_path = [entry + '/' for entry in entries]
    print("source path: %s" % source_path)

    nothing_listed = len(source_path) == 0
    # The last character is the slash appended above, so strip it before
    # looking at the extension.
    if nothing_listed or source_path[-1][:-1].endswith('.xml'):
        return th.source_path_warnings()

    first_entry = source_path[0]
    folder_level = first_entry.count('/') - 1
    print("folder destination level: %s" % folder_level)
    return source_path, folder_level
Пример #5
0
def load_data_2(source_path):
    """Read the ``.txt`` patents under each folder into data frames.

    Each file is read field by field with ``get_txt_text(file, 1)``. The first
    field is the patent kind, which decides which further fields are read:

    * ``A1``: classcode, applicant, abstract, citations
    * ``B1``: classcode, id of the respective document, claim, description
    * anything else: classcode, applicant, abstract, claim, description,
      citations

    The classcode and whichever of abstract / claim / description were read are
    forwarded to ``fill_dataframe`` (missing ones as ``None``).

    Args:
        source_path: iterable of folders to scan.

    Returns:
        ``(patent_ids, data_frame, classifications_df)``. ``patent_ids`` holds
        ``th.get_patent_id(path)`` for every file; ``data_frame`` gets an
        extra ``id`` column copied from its index; ``classifications_df`` is
        sorted by ``count`` in descending order.
    """
    print('###  reading patents  ###')
    data_frame = pd.DataFrame(columns=['abstract', 'claim', 'description', 'classification'])
    classifications_df = pd.DataFrame(columns=['class', 'count'])
    patent_ids = []
    for path in source_path:
        for patent_index, path_filename in enumerate(fh.get_list_files(path, 'txt')):
            abstract = claim = description = None
            # ``with`` guarantees the handle is released even if a read or
            # ``th.get_patent_id`` raises.
            with open(path_filename, "r") as file:
                patent_ids.append(th.get_patent_id(path_filename))
                kind = get_txt_text(file, 1).strip()

                if kind == 'A1':
                    classcode = get_txt_text(file, 1).strip()
                    # applicant: read and discarded (presumably needed to
                    # advance to the next field - confirm in get_txt_text)
                    get_txt_text(file, 1).strip()
                    abstract = get_txt_text(file, 1).strip()
                    # citations: read and discarded
                    get_txt_text(file, 1).strip()
                elif kind == 'B1':
                    classcode = get_txt_text(file, 1).strip()
                    # id of the respective document: read and discarded
                    get_txt_text(file, 1).strip()
                    claim = get_txt_text(file, 1).strip()
                    description = get_txt_text(file, 1).strip()
                else:
                    if kind == 'A1B1':
                        print("eu_mix_patent !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                    else:
                        print("us_patent     !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                    classcode = get_txt_text(file, 1).strip()
                    # applicant: read and discarded
                    get_txt_text(file, 1).strip()
                    abstract = get_txt_text(file, 1).strip()
                    claim = get_txt_text(file, 1).strip()
                    description = get_txt_text(file, 1).strip()
                    # citations: read and discarded
                    get_txt_text(file, 1).strip()

            # The file is already closed here, as in the original flow.
            fill_dataframe(data_frame, classifications_df, classcode,
                           abstract, claim, description, patent_index)

    data_frame['id'] = data_frame.index
    classifications_df.sort_values(by=['count'], ascending=False, inplace=True, kind='quicksort')
    return patent_ids, data_frame, classifications_df
def explore_patents(source_path, destination_path):
    """Split each XML file into patents, process them and write the index.

    For every ``.xml`` file under each folder the raw bytes are read and
    decoded through a ``TextIOWrapper``; the text is passed to ``get_patents``
    together with the first four-digit number found in the folder path
    (presumably the year - confirm against ``get_patents``). Every returned
    patent is handed to ``process_patent`` along with the shared
    ``patent_index`` frame. Finally the index is written with
    ``xmlh.write_index(patent_index, script_key)``.

    Args:
        source_path: iterable of folders to scan.
        destination_path: sequence indexed in parallel with ``source_path``.

    Returns:
        None.
    """
    patent_index = pd.DataFrame(columns=['file', 'country', 'kind', 'date', 'path'])
    for index, path in enumerate(source_path):
        files = fh.get_list_files(path, 'xml')
        for path_filename in files:
            try:
                # Read everything up front so the OS handle is released
                # immediately instead of being left open.
                with open(path_filename, "rb") as f:
                    patent_file = TextIOWrapper(BytesIO(f.read()))

                text = patent_file.read()
                # A path without a four-digit run makes ``re.search`` return
                # None; the resulting AttributeError is reported below.
                number_in_path = int(re.search("([0-9]{4})", path).group())
                patents = get_patents(text, number_in_path)

                for patent in tqdm(patents):
                    process_patent(patent[0], patent[1], destination_path[index],
                                   path, patent_index)
            except Exception:
                # Best-effort batch: report the offending file and keep going.
                # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit
                # still stop the run.
                print("WARNING!!!! Check out the file: ", path_filename)
                continue
    xmlh.write_index(patent_index, script_key)
Пример #7
0
def apply_method_for_reading_patents_1_2(data_frame, classifications_df, path):
    """Run ``apply_method_for_reading_patents_2`` on every ``.txt`` file in ``path``.

    Each call receives the two frames, the file's position in the listing and
    its path. Returns the list of the individual call results, in listing order.
    """
    outcomes = []
    txt_files = fh.get_list_files(path, 'txt')
    for position, txt_file in enumerate(txt_files):
        outcome = apply_method_for_reading_patents_2(
            data_frame, classifications_df, position, txt_file)
        outcomes.append(outcome)
    return outcomes
Пример #8
0
def handle_path_patent(data_frame, classifications_df, path):
    """Run ``handle_patent_file`` on every ``.txt`` file in ``path``.

    Returns the list of the individual call results, in listing order.
    """
    txt_files = fh.get_list_files(path, 'txt')
    return [
        handle_patent_file(data_frame, classifications_df, txt_file)
        for txt_file in txt_files
    ]
Пример #9
0
def explore_patents(source_path, destination_path):
    """Parse the XML patents under each folder, write A1/B1 files and the index.

    For every ``.xml`` file the root element must carry the ``lang``, ``kind``,
    ``file``, ``date-publ`` and ``country`` attributes; otherwise an "outlier"
    warning is printed. English documents with a non-empty classcode are then
    handled by kind:

    * ``A1``: parsed with ``a1_parser``; written and indexed only when the
      abstract is not empty.
    * ``B1``: parsed with ``b1_parser``; the second argument is ``False`` when
      the respective document id (minus its last four characters) is already
      in the index's ``file`` column and ``True`` otherwise. Always written
      and indexed.

    The index is finally written with ``xmlh.write_index(patent_data, script_key)``.

    Args:
        source_path: iterable of folders to scan.
        destination_path: sequence indexed in parallel with ``source_path``.

    Returns:
        None.
    """
    patent_data = pd.DataFrame(
        columns=['file', 'country', 'kind', 'date', 'path'])
    required_attributes = ('lang', 'kind', 'file', 'date-publ', 'country')
    for index, path in enumerate(source_path):
        files = fh.get_list_files(path, 'xml')
        for path_filename in tqdm(files):
            try:
                parsed_xml = etree.parse(path_filename)
                patent_document = parsed_xml.getroot()

                # country, date, doc_n, dtd_ver, file, id, kind, lang, status
                attributes = patent_document.attrib
                if not all(name in attributes for name in required_attributes):
                    print("WARNING!!! outlier - no patent document: ",
                          path_filename)
                    continue

                lang = attributes['lang'].upper()
                kind = attributes['kind'].upper()
                file = attributes['file']
                date = attributes['date-publ']
                region = attributes['country']

                country = get_alternative_country(patent_document)
                classcode = get_alternative_classcode(patent_document)

                if lang != 'EN' or classcode == "":
                    continue

                filename = th.get_eu_filename(
                    path_filename)  # LLDDDDDDDDLLLD.xml
                if kind == 'A1':  # bibliography - index
                    applicant, abstract, citations = a1_parser(
                        patent_document)
                    # add A1 to index if the abstract is not empty
                    # (and the respective B1 is not in the dataset)
                    if abstract != "":
                        xmlh.write_eu_a1_xml_patent(
                            destination_path[index], filename, lang,
                            kind, classcode, applicant, abstract,
                            citations)

                        filenumber = get_file_number(file)
                        patent_data.loc[patent_data.shape[0] + 1] = [
                            region + filenumber, country, kind, date,
                            path
                        ]

                elif kind == 'B1':  # text - data
                    id_respective_document = get_id_document(
                        region, file, kind)
                    # ``x in series`` tests the index labels, not the stored
                    # values, so compare against ``.values`` to actually look
                    # the id up in the 'file' column.
                    already_indexed = (
                        id_respective_document[:-4]
                        in patent_data['file'].values)
                    # If the A1 is already indexed its abstract was saved
                    # there (False); otherwise the patent information has to
                    # be saved here as well (True).
                    description, claim, abstract, citations = b1_parser(
                        patent_document, not already_indexed)
                    xmlh.write_eu_b1_xml_patent(
                        destination_path[index], filename, lang, kind,
                        id_respective_document, classcode, abstract,
                        claim, description, citations)
                    filenumber = get_file_number(file)
                    patent_data.loc[patent_data.shape[0] + 1] = [
                        region + filenumber, country, kind, date, path
                    ]
            except Exception:
                # Best-effort batch: report the offending file and keep going.
                # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit
                # still stop the run.
                print("WARNING!!!! Check out the patent: ", path_filename)
                continue
    xmlh.write_index(patent_data, script_key)