def getDataSetPatentColumn(data_set, df, columnName):
    """Load one patent field of a dataset into ``df`` as a parsed column.

    Args:
        data_set: Name of the dataset (``Datasets.name``) whose patents are read.
        df: DataFrame to fill. It gains an ``ids`` column and a column named
            ``columnName``; it is modified in place and also returned.
        columnName: Name of the ``Patents`` field to read. Values are expected
            to be strings (or empty/None).

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no dataset named ``data_set`` exists.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)

    # Fetch both fields in ONE query so every id stays paired with its own
    # value; two separate unordered queries are not guaranteed to return rows
    # in the same order.
    rows = list(patents.values_list('id', columnName))
    df['ids'] = [row[0] for row in rows]

    # CPC codes and cleaned assignee names are upper-cased; every other
    # column is title-cased. Empty values are normalised to None.
    upperCased = columnName in ('cpc', 'clean_current_assignees')
    classes = []
    for _, value in rows:
        if value:
            classes.append(value.upper() if upperCased else value.title())
        else:
            classes.append(None)

    # The first non-empty value containing '[' or ';' decides how the whole
    # column is parsed ('[' takes priority within a value).
    stringType = 'lines'
    for c in classes:
        if c and '[' in c:
            stringType = 'list'
            break
        if c and ';' in c:
            stringType = 'semiColon'
            break

    if stringType == 'list':
        df[columnName] = catc.stringListToList(classes)
    elif stringType == 'semiColon':
        df[columnName] = catc.splitToList(classes, ';')
    else:
        df[columnName] = catc.splitLinesToList(classes)
    return df
def getDataSetPatentCPCs(data_set, df):
    """Add the patent ids and parsed CPC codes of a dataset to ``df``.

    Args:
        data_set: Name of the dataset (``Datasets.name``) whose patents are read.
        df: DataFrame to fill with ``ids`` and ``CPCs`` columns; modified in
            place and also returned.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no dataset named ``data_set`` exists.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)

    # A single query keeps each id paired with its own CPC string; separate
    # unordered queries are not guaranteed to return rows in the same order.
    rows = list(patents.values_list('id', 'cpc'))
    df['ids'] = [row[0] for row in rows]
    CPCs = [row[1] for row in rows]

    # Any value containing ';' switches the whole column to ';'-splitting.
    if any(c and ';' in c for c in CPCs):
        # NOTE(review): getDataSetPatentColumn uses catc.splitToList for the
        # ';' case; confirm catc.stringToList is the intended helper here.
        df['CPCs'] = catc.stringToList(CPCs, ';')
    else:
        df['CPCs'] = catc.splitLinesToList(CPCs)
    return df
def updateDataSetCategories(data_set, df, publicationNumberColumnName, categoryColumnName):
    """Write predicted categories from ``df`` back to the dataset's patents.

    The category column of ``df`` is first replaced, in place, by the result
    of ``catc.stringListToList``. Then, for every row, the patents of the
    dataset with that row's publication number get their
    ``predicted_categories`` field updated.

    Args:
        data_set: Name of the dataset (``Datasets.name``) to update.
        df: DataFrame holding publication numbers and categories.
        publicationNumberColumnName: Column of ``df`` with publication numbers.
        categoryColumnName: Column of ``df`` with the categories to store.

    Raises:
        IndexError: If ``df`` has rows but no dataset named ``data_set`` exists.
    """
    df[categoryColumnName] = catc.stringListToList(df[categoryColumnName].tolist())
    publicationNumbers = df[publicationNumberColumnName].tolist()
    categories = df[categoryColumnName].tolist()

    # Nothing to write; skip the dataset lookup entirely.
    if not publicationNumbers:
        return

    # Resolve the dataset id once. Indexing an unevaluated QuerySet inside
    # the loop would hit the database again for every row.
    dataSetId = Datasets.objects.filter(name=data_set)[0].id

    for publicationNumber, category in zip(publicationNumbers, categories):
        Patents.objects.filter(
            dataset_id=dataSetId,
            publication_numbers=publicationNumber,
        ).update(predicted_categories=category)
def getTrainingSetPatents(training_set):
    """Build a DataFrame of the patents belonging to a training set.

    Args:
        training_set: Name of the training set (``TrainingDatasets.name``).

    Returns:
        A new DataFrame with the columns ``ids``, ``Titles``, ``Abstracts``,
        ``Independent Claims`` and ``Categories``.

    Raises:
        IndexError: If no training set named ``training_set`` exists.
    """
    trainingSet = TrainingDatasets.objects.filter(name=training_set)
    patents = TrainingPatents.objects.filter(training_datasets_id=trainingSet[0].id)

    # One query for all fields keeps the columns row-aligned; separate
    # unordered queries are not guaranteed to return rows in the same order.
    rows = list(patents.values_list(
        'id', 'titles', 'abstracts', 'independent_claims', 'classification'))

    df = pd.DataFrame()
    df['ids'] = [row[0] for row in rows]
    df['Titles'] = [row[1] for row in rows]
    df['Abstracts'] = [row[2] for row in rows]
    df['Independent Claims'] = [row[3] for row in rows]
    categories = [row[4] for row in rows]

    # The first classification decides the parsing format. Guard against an
    # empty training set or a missing first value, which previously raised.
    first = categories[0] if categories else None
    looksLikeList = bool(first) and (
        ('[' in first and ']' in first) or ('(' in first and ')' in first)
    )
    if looksLikeList:
        df['Categories'] = catc.stringListToList(categories)
    else:
        df['Categories'] = catc.splitLinesToList(categories)
    return df
def getDataSetPatentTechnicalConcepts(data_set, df):
    """Add the patent ids and parsed technical concepts of a dataset to ``df``.

    Args:
        data_set: Name of the dataset (``Datasets.name``) whose patents are read.
        df: DataFrame to fill with ``ids`` and ``Technical Concepts`` columns;
            modified in place and also returned.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no dataset named ``data_set`` exists.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)

    # A single query keeps each id paired with its own value; separate
    # unordered queries are not guaranteed to return rows in the same order.
    rows = list(patents.values_list('id', 'technical_concepts'))
    df['ids'] = [row[0] for row in rows]
    technicalConcepts = [row[1] for row in rows]
    df['Technical Concepts'] = catc.stringListToList(technicalConcepts)
    return df
def getDataSetPatentCPCDescriptions(data_set, df):
    """Add the patent ids and a ``CPC Descriptions`` column to ``df``.

    Args:
        data_set: Name of the dataset (``Datasets.name``) whose patents are read.
        df: DataFrame to fill with ``ids`` and ``CPC Descriptions`` columns;
            modified in place and also returned.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no dataset named ``data_set`` exists.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)

    # A single query keeps each id paired with its own value; separate
    # unordered queries are not guaranteed to return rows in the same order.
    # NOTE(review): this reads the 'cpc' field, not 'cpc_descriptions' (which
    # getDataSetPatents uses for its 'CPC Descriptions' column). Behaviour is
    # kept as-is; confirm which field is intended.
    rows = list(patents.values_list('id', 'cpc'))
    df['ids'] = [row[0] for row in rows]
    CPCs = [row[1] for row in rows]
    df['CPC Descriptions'] = catc.splitLinesToList(CPCs)
    return df
def getDataSetPatentCategoryByKeywords(data_set, df):
    """Add the patent ids and parsed keyword-based categories to ``df``.

    Args:
        data_set: Name of the dataset (``Datasets.name``) whose patents are read.
        df: DataFrame to fill with ``ids`` and ``Category by Keywords``
            columns; modified in place and also returned.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no dataset named ``data_set`` exists.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)

    # A single query keeps each id paired with its own value; separate
    # unordered queries are not guaranteed to return rows in the same order.
    rows = list(patents.values_list('id', 'category_by_keywords'))
    df['ids'] = [row[0] for row in rows]
    categoryByKeywords = [row[1] for row in rows]
    df['Category by Keywords'] = catc.stringListToList(categoryByKeywords)
    return df
def getDataSetPatentMarketSegments(data_set, df):
    """Add the patent ids and parsed market segments of a dataset to ``df``.

    Args:
        data_set: Name of the dataset (``Datasets.name``) whose patents are read.
        df: DataFrame to fill with ``ids`` and ``Market Segments`` columns;
            modified in place and also returned.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no dataset named ``data_set`` exists.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)

    # A single query keeps each id paired with its own value; separate
    # unordered queries are not guaranteed to return rows in the same order.
    rows = list(patents.values_list('id', 'market_segment'))
    df['ids'] = [row[0] for row in rows]
    marketSegments = [row[1] for row in rows]
    df['Market Segments'] = catc.stringListToList(marketSegments)
    return df
def getTrainingSetPatentClassifications(training_set):
    """Return the distinct classifications used in a training set.

    Args:
        training_set: Name of the training set (``TrainingDatasets.name``).

    Returns:
        A list of the unique classification values across all patents of the
        training set, in no particular order.

    Raises:
        IndexError: If no training set named ``training_set`` exists.
    """
    trainingSet = TrainingDatasets.objects.filter(name=training_set)
    patents = TrainingPatents.objects.filter(training_datasets_id=trainingSet[0].id)
    categories = [d[0] for d in patents.values_list('classification')]

    # Collect straight into a set instead of growing a list with repeated
    # `list + list`, which copies the accumulated list for every patent.
    uniqueCategories = set()
    for categoryList in catc.splitLinesToList(categories):
        uniqueCategories.update(categoryList)
    return list(uniqueCategories)
def getDataSetPatentPredictedCategories(data_set):
    """Build a DataFrame of patent ids and parsed predicted categories.

    Args:
        data_set: Name of the dataset (``Datasets.name``) whose patents are read.

    Returns:
        A new DataFrame with the columns ``ids`` and ``Predicted Categories``.

    Raises:
        IndexError: If no dataset named ``data_set`` exists.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)

    # A single query keeps each id paired with its own value; separate
    # unordered queries are not guaranteed to return rows in the same order.
    rows = list(patents.values_list('id', 'predicted_categories'))

    df = pd.DataFrame()
    df['ids'] = [row[0] for row in rows]
    predictedCategories = [row[1] for row in rows]
    df['Predicted Categories'] = catc.stringListToList(predictedCategories)
    return df
def getDataSetPatentAssignees(data_set, df):
    """Add patent ids and assignee columns of a dataset to ``df``.

    Args:
        data_set: Name of the dataset (``Datasets.name``) whose patents are read.
        df: DataFrame to fill; modified in place and also returned. It gains
            the columns ``ids``, ``Current Assignees``,
            ``Clean Assignees List`` and ``Clean Assignees``.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no dataset named ``data_set`` exists.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)

    # One query for all three fields keeps the columns row-aligned; separate
    # unordered queries are not guaranteed to return rows in the same order.
    rows = list(patents.values_list('id', 'current_assignee', 'clean_current_assignees'))
    df['ids'] = [row[0] for row in rows]
    df['Current Assignees'] = [row[1] for row in rows]

    cleanAssignees = [row[2] for row in rows]
    df['Clean Assignees List'] = catc.stringListToList(cleanAssignees)
    # Flat display string: the stored value with quotes and brackets removed.
    df['Clean Assignees'] = [
        str(d).replace("'", '').replace("[", '').replace("]", '')
        for d in cleanAssignees
    ]
    return df
def getDataSetPatentsBySource(df, sourceName):
    """Fill ``df`` with the patents of every dataset that has a given source.

    Args:
        df: DataFrame to fill; modified in place and also returned. It gains
            the columns ``ids``, ``Dates``, ``Titles``, ``Abstracts``,
            ``CPCs``, ``Types``, ``Current Assignees``,
            ``Clean Assignees List`` and ``Clean Assignees``.
        sourceName: Value of ``Datasets.source`` selecting the datasets.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        ValueError: If a patent's ``year`` value does not match the
            ``'%B %d, %Y'`` format (e.g. ``'January 5, 2020'``).
    """
    fields = (
        'id', 'year', 'titles', 'abstracts', 'cpc',
        'current_assignee', 'clean_current_assignees', 'patent_type',
    )

    # One query per dataset for all fields keeps the columns row-aligned
    # (separate unordered queries are not guaranteed to agree on row order)
    # and avoids eight round trips per dataset.
    rows = []
    for dataSet in Datasets.objects.filter(source=sourceName):
        rows.extend(Patents.objects.filter(dataset_id=dataSet.id).values_list(*fields))

    dates = [row[1] for row in rows]
    cleanAssignees = [row[6] for row in rows]

    df['ids'] = [row[0] for row in rows]
    # 'January 5, 2020' -> '20200105'
    df['Dates'] = [
        str(datetime.datetime.strptime(d, '%B %d, %Y').date()).replace('-', '')
        for d in dates
    ]
    df['Titles'] = [row[2] for row in rows]
    df['Abstracts'] = [row[3] for row in rows]
    df['CPCs'] = catc.splitLinesToList([row[4] for row in rows])
    df['Types'] = [row[7] for row in rows]
    df['Current Assignees'] = [row[5] for row in rows]
    df['Clean Assignees List'] = catc.stringListToList(cleanAssignees)
    # Flat display string: the stored value with quotes and brackets removed.
    df['Clean Assignees'] = [
        str(d).replace("'", '').replace("[", '').replace("]", '')
        for d in cleanAssignees
    ]
    return df
def updateAssigneeGrouping(df, keywordsDF):
    """Map each patent's current assignees onto keyword-defined groups.

    For every assignee name, a keyword matches when it equals (case-insensitively)
    one whitespace-separated word of the name or the whole name. A matching
    assignee is replaced by the keyword's group; otherwise the name is kept.
    All resulting names are upper-cased and de-duplicated per patent.

    Args:
        df: DataFrame with a ``Current Assignees`` column; gains an
            ``Assignee Group`` column (one list per row). Modified in place
            and also returned.
        keywordsDF: DataFrame with a ``Keywords`` column (an iterable of
            keyword strings per row) and a ``Group`` column (group name).

    Returns:
        The same ``df`` object that was passed in.
    """
    CAListList = catc.splitLinesToList(df['Current Assignees'].tolist())
    keywordsListList = keywordsDF['Keywords'].tolist()
    groups = keywordsDF['Group'].tolist()

    cleanCAListList = []
    for CAList in CAListList:
        updatedCAList = []
        for CA in CAList:
            # Skip empty names and stringified NaN placeholders.
            if not CA or CA == 'NAN' or CA == 'nan':
                continue

            # Lower-case once per assignee instead of once per comparison.
            loweredCA = CA.lower()
            CAWords = set(loweredCA.split())
            updatedCA = CA
            # A whitespace-only name has no words, so nothing can match it.
            if CAWords:
                # NOTE(review): every keyword is checked, so when several
                # groups match, the last matching group wins. Confirm that
                # first-match is not what was intended.
                for keywordsList, group in zip(keywordsListList, groups):
                    for keyword in keywordsList:
                        loweredKeyword = keyword.lower()
                        if loweredKeyword in CAWords or loweredKeyword == loweredCA:
                            updatedCA = group
            updatedCAList.append(updatedCA.upper())

        # De-duplicate while keeping first-seen order, so the result does not
        # vary with string hash randomisation as list(set(...)) does.
        cleanCAListList.append(list(dict.fromkeys(updatedCAList)))

    df['Assignee Group'] = cleanCAListList
    return df
def getDataSetPatents(data_set):
    """Build a DataFrame with all exported fields of a dataset's patents.

    Args:
        data_set: Name of the dataset (``Datasets.name``) whose patents are read.

    Returns:
        A new DataFrame with one row per patent. Columns, in order: ``ids``,
        ``Years``, ``Publication Numbers``, ``Titles``, ``Abstracts``,
        ``Independent Claims``, ``Technical Concepts``, ``Main CPC``,
        ``Main CPC Description``, ``CPCs``, ``CPC Descriptions``,
        ``Current Assignees``, ``Predicted Market Segments``,
        ``Predicted Topics``, ``Types``, ``Categories``, ``Market Segments``,
        ``Category by Keywords``, ``Predicted Categories``, ``Categories2``,
        ``Categories3``, ``Categories4``, ``Clean Assignees List`` and
        ``Clean Assignees``.

    Raises:
        IndexError: If no dataset named ``data_set`` exists.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)

    # (DataFrame column, Patents field, converter or None), in output order.
    columnSpecs = [
        ('ids', 'id', None),
        ('Years', 'year', None),
        ('Publication Numbers', 'publication_numbers', None),
        ('Titles', 'titles', None),
        ('Abstracts', 'abstracts', None),
        ('Independent Claims', 'independent_claims', None),
        ('Technical Concepts', 'technical_concepts', catc.stringListToList),
        ('Main CPC', 'main_cpc', None),
        ('Main CPC Description', 'main_cpc_description', None),
        ('CPCs', 'cpc', catc.splitLinesToList),
        ('CPC Descriptions', 'cpc_descriptions', catc.stringListToList),
        ('Current Assignees', 'current_assignee', None),
        ('Predicted Market Segments', 'predicted_market_segments', None),
        ('Predicted Topics', 'predicted_topic', None),
        ('Types', 'patent_type', None),
        ('Categories', 'category', catc.stringListToList),
        ('Market Segments', 'market_segment', catc.stringListToList),
        ('Category by Keywords', 'category_by_keywords', catc.stringListToList),
        ('Predicted Categories', 'predicted_categories', catc.stringListToList),
        ('Categories2', 'category2', catc.stringListToList),
        ('Categories3', 'category3', catc.stringListToList),
        ('Categories4', 'category4', catc.stringListToList),
    ]
    fields = [field for _, field, _ in columnSpecs]
    # The clean assignees feed two columns, so they are handled after the loop.
    cleanAssigneesIndex = len(fields)
    fields.append('clean_current_assignees')

    # One query for every field keeps all columns row-aligned (separate
    # unordered queries are not guaranteed to agree on row order) and
    # replaces two dozen round trips with one.
    rows = list(patents.values_list(*fields))

    df = pd.DataFrame()
    for position, (column, _, convert) in enumerate(columnSpecs):
        values = [row[position] for row in rows]
        df[column] = convert(values) if convert else values

    cleanAssignees = [row[cleanAssigneesIndex] for row in rows]
    df['Clean Assignees List'] = catc.stringListToList(cleanAssignees)
    # Flat display string: the stored value with quotes and brackets removed.
    df['Clean Assignees'] = [
        str(d).replace("'", '').replace("[", '').replace("]", '')
        for d in cleanAssignees
    ]
    return df