import collections
import datetime

import matplotlib.pyplot as plt
from sqlalchemy.sql.expression import func

from database.make_patent_db import load_session, Patent


def evaluate_cat_distr(thresh=None, nchars=4):
	"""
	Evaluates the distribution of CPC categories in the data set
	seperately for target patents (published in 2015)
	and the overall data set (patents published between 2000 and 2015)
	and plots the results in a histogram.
	Either a depth reduction or a threshold should be given!

	Inputs:
		- thresh: minimum number of patents a category should contain
				  in order to be considered (default: None)
		- nchars: number of characters that should be considered,
				  defines how deep to step into classification
				  (default: 4 --> e.g.: 'A61M', if set to None, analysis is
				  	very detailed --> e.g.: 'A61M2016/0039')

	"""
	session = load_session()
	# extract target patent set
	target_pats = session.query(Patent).filter(Patent.pub_date >= datetime.datetime(2015, 1, 1, 0, 0))
	# draw 10000 random patents from entire population including the target patents
	random_pats = session.query(Patent).order_by(func.random()).limit(10000)

	# get the (truncated) category codes for both patent sets and count them,
	# keeping only categories above the threshold (if one is given)
	target_cats = [pat.category[:nchars] for pat in target_pats]
	target_cat_dict = {k: v for k, v in collections.Counter(target_cats).items() if thresh is None or v >= thresh}

	random_cats = [pat.category[:nchars] for pat in random_pats]
	random_cat_dict = {k: v for k, v in collections.Counter(random_cats).items() if thresh is None or v >= thresh}

	# plot category distribution of the target patents
	plt.figure()
	plt.bar(range(len(target_cat_dict)), list(target_cat_dict.values()))
	plt.xticks(range(len(target_cat_dict)), list(target_cat_dict.keys()), rotation=70, fontsize=5)
	plt.savefig('db_statistics/target_cat_distr_%s_%s.pdf' % (nchars, thresh))
	plt.close()

	# plot category distribution of the random sample
	plt.figure()
	plt.bar(range(len(random_cat_dict)), list(random_cat_dict.values()))
	plt.xticks(range(len(random_cat_dict)), list(random_cat_dict.keys()), rotation=70, fontsize=5)
	plt.savefig('db_statistics/random_cat_distr_%s_%s.pdf' % (nchars, thresh))
	plt.close()
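

# A minimal usage sketch (an assumption, not part of the original example):
# run the histogram comparison at section depth 4, keeping only categories
# that contain at least 20 patents. Assumes the patent DB and the
# 'db_statistics/' output directory exist.
if __name__ == '__main__':
	evaluate_cat_distr(thresh=20, nchars=4)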
Example #2
"""
This is a simple helper script for extracting some statistics
on the patents contained in the DB and plotting them
(e.g. average section length, etc.)
"""
import sqlalchemy
import random
import numpy as np
import collections
import matplotlib.pyplot as plt
from database.make_patent_db import load_session, Patent, Citation
from sqlalchemy.sql.expression import func
from nltk.tokenize import word_tokenize


session = load_session()

# draw 10000 random patents from the DB
patent_sample = session.query(Patent).order_by(func.random()).limit(10000)


def calc_section_length():
    # token counts per section for every patent in the sample
    section_length = collections.defaultdict(list)
    for pat in patent_sample:
        section_length['abstract'].append(len(word_tokenize(pat.abstract)))
        section_length['claims'].append(len(word_tokenize(pat.claims)))
        section_length['description'].append(len(word_tokenize(pat.description)))
    # average length per section
    section_length_avg = {}
    for key in section_length:
        section_length_avg[key] = np.mean(np.array(section_length[key]))
    np.save('db_statistics/section_length_avg.npy', section_length_avg)
    np.save('db_statistics/section_length.npy', section_length)
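

# A minimal usage sketch (an assumption, not part of the original script):
# compute the statistics, then reload the saved averages for inspection.
# np.save pickles the dict, so recent NumPy versions need allow_pickle=True
# when loading it back.
if __name__ == '__main__':
    calc_section_length()
    section_length_avg = np.load('db_statistics/section_length_avg.npy',
                                 allow_pickle=True).item()
    print(section_length_avg)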
Example #3
import csv
import datetime
import re

import sqlalchemy
from sqlalchemy import Column, ForeignKey, MetaData, String, Table

from database.make_patent_db import load_session, Patent, Citation


def sample_data(nrand=1000,
                date=datetime.datetime(2015, 1, 1, 0, 0),
                id_=None,
                cat=None):
    """
    Extract the target, cited and random patents from the DB

    Input:
        - nrand: Number of random patents to sample (default: 1000)
        - date: Threshold date to separate target and rest patents (default: 01/01/2015)
        - id_: To be set if the scores should be evaluated only for one target patent
               e.g. for the ones scored by patent attorney (default None)
        - cat: To be set if the scores should be evaluated only for a certain category
               e.g. 'A61M'(default None)

    Returns:
        - random_pats: query of nrand randomly sampled patents (published before date)
        - cited_pats: patents cited by the target patents
        - target_pats: patents published on or after the given date
        - dupl_pats: duplicate patents (from the apa lists) found in the DB
        - cited_ids: dict mapping each target patent id to the ids it cites
        - dupl_ids: dict mapping each target patent id to its duplicates' ids
    """
    session = load_session()
    # all patents published on or after the given date are considered target patents
    target_pats = session.query(Patent).filter(Patent.pub_date >= date)
    rest_set = session.query(Patent).filter(Patent.pub_date < date)
    # if the scores are to be calculated for only one target patent
    if id_:
        print("evaluating simscores for patent %s" % id_)
        target_pats = session.query(Patent).filter(Patent.id == id_)
    # if the scores should be evaluated only for a certain category
    elif cat:
        print "evaluating simscores for category %s" % cat
        cat_pats = session.query(Patent).filter(Patent.category.contains(cat))
        cat_pats_ids = [pat.id for pat in cat_pats]
        target_pats = cat_pats.filter(
            Patent.pub_date >= datetime.datetime(2015, 1, 1, 0, 0))
        # the random patents are sampled from the patents published before 2015
        rest_set = cat_pats.filter(
            Patent.pub_date < datetime.datetime(2015, 1, 1, 0, 0))
    else:
        print "evaluating for all target patents"
    # the random patents are sampled from the patents published before the given date
    engine = session.get_bind()
    metadata = MetaData()
    metadata.bind = engine
    # create tables for cited and duplicate patents
    cited = Table(
        "cited", metadata,
        Column('id', String, ForeignKey(Patent.id), primary_key=True))
    duplicates = Table(
        "duplicates", metadata,
        Column('id', String, ForeignKey(Patent.id), primary_key=True))
    # drop the tables from a previous run (if they exist)
    try:
        cited.drop()
        duplicates.drop()
    except sqlalchemy.exc.SQLAlchemyError:
        pass
    cited.create()
    duplicates.create()
    conn = engine.connect()
    # collect IDs for each target patent
    cited_ids = {}
    dupl_ids = {}
    ## get the duplicates and cited patents for all target patents
    print "getting duplicates and cited patents"
    for patent in target_pats:
        # get duplicate ids (read from the apa list CSVs)
        with open(
                '/home/lea/Documents/master_thesis/patent_search/pats_2015_apa_lists/apa_list_%s.csv'
                % str(patent.id)) as apa_file:
            apa_list_reader = csv.reader(apa_file, delimiter='\t')
            duplicates_list = next(apa_list_reader)
            dupl_ids[patent.id] = [
                re.sub(' ', '', pat) for pat in duplicates_list
            ]
        # get cited ids
        citations = session.query(Citation).filter(
            Citation.citing_pat == patent.id)
        cited_patents = []
        # check if the cited id is a duplicate
        for pat in citations:
            # if the simcoefs are to be evaluated only for a certain category
            if cat:
                # check if the cited pat is in the given category
                if pat.cited_pat not in cat_pats_ids:
                    continue
            if pat.cited_pat not in dupl_ids[patent.id]:
                cited_patents.append(pat.cited_pat)
        cited_ids[patent.id] = cited_patents
    ## fill tables with cited and duplicate patents
    print "filling tables cited and duplicates"
    # unite all cited and duplicate ids in one list
    all_cited_ids = []
    all_dupl_ids = []
    for pid in cited_ids.keys():
        # fill table with citations; iterate over a copy of the list so that
        # empty ids can be removed from the original while looping
        for cited_id in list(cited_ids[pid]):
            # check if id equals empty string, if so remove
            if cited_id == '':
                cited_ids[pid].remove(cited_id)
            # insert patent into table
            else:
                try:
                    ins = cited.insert().values(id=cited_id)
                    conn.execute(ins)
                    all_cited_ids.append(cited_id)
                # an IntegrityError is thrown if the patent is already in the table
                except sqlalchemy.exc.IntegrityError:
                    continue
        # fill table with duplicates
        # get duplicate patents for the current target patent
        duplicate_pats = dupl_ids[pid]
        dupls_temp = []
        for dupl_id in duplicate_pats:
            # if the simcoefs are to be evaluated only for a certain category
            if cat:
                # check if the duplicate is in the given category
                if dupl_id not in cat_pats_ids:
                    continue
            # check if id equals empty string
            if dupl_id == '':
                continue
            # check if the duplicate is already in the DB
            elif session.query(Patent).filter(
                    Patent.id == dupl_id).count() == 0:
                continue
            # insert duplicate patent into duplicates table
            else:
                try:
                    ins = duplicates.insert().values(id=dupl_id)
                    conn.execute(ins)
                    all_dupl_ids.append(dupl_id)
                    dupls_temp.append(dupl_id)
                # an IntegrityError is thrown if the patent is already in the table
                except sqlalchemy.exc.IntegrityError:
                    continue