def getDataSetPatentColumn(data_set, df, columnName):
    """Load one text column of a data set's patents into ``df``.

    Writes the patent primary keys to ``df['ids']`` and the normalised,
    parsed values of ``columnName`` to ``df[columnName]``.

    Args:
        data_set: Name of the ``Datasets`` row whose patents are read.
        df: DataFrame to fill; it is modified in place and returned.
        columnName: Name of the ``Patents`` field to read.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no data set is named ``data_set``.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)
    # Fetch both fields in ONE query: two separate unordered queries are not
    # guaranteed to return rows in the same order, which could pair an id
    # with another patent's value.
    rows = list(patents.values_list('id', columnName))
    df['ids'] = [row[0] for row in rows]

    # 'cpc' and 'clean_current_assignees' are upper-cased; every other
    # column is title-cased. Empty/NULL values become None.
    upperCase = columnName in ('cpc', 'clean_current_assignees')
    classes = []
    for row in rows:
        value = row[1]
        if not value:
            classes.append(None)
        elif upperCase:
            classes.append(value.upper())
        else:
            classes.append(value.title())

    # Sniff the storage format from the first value that contains a marker:
    # '[' means a stringified list, ';' a semicolon-separated string.
    # When one value contains both, '[' takes precedence.
    stringType = 'lines'
    for c in classes:
        if not c:
            continue
        if '[' in c:
            stringType = 'list'
            break
        if ';' in c:
            stringType = 'semiColon'
            break

    if stringType == 'list':
        df[columnName] = catc.stringListToList(classes)
    elif stringType == 'semiColon':
        df[columnName] = catc.splitToList(classes, ';')
    else:
        df[columnName] = catc.splitLinesToList(classes)
    return df
def getDataSetPatentCPCs(data_set, df):
    """Load patent ids and parsed CPC values of a data set into ``df``.

    Writes ``df['ids']`` and ``df['CPCs']``. The raw ``cpc`` strings are
    parsed with ``catc.stringToList(..., ';')`` when any of them contains a
    semicolon, otherwise with ``catc.splitLinesToList``.

    Args:
        data_set: Name of the ``Datasets`` row whose patents are read.
        df: DataFrame to fill; it is modified in place and returned.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no data set is named ``data_set``.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)
    # Single query so every id is paired with its own cpc value; separate
    # unordered queries are not guaranteed to agree on row order.
    rows = list(patents.values_list('id', 'cpc'))
    df['ids'] = [row[0] for row in rows]
    CPCs = [row[1] for row in rows]

    if any(c and ';' in c for c in CPCs):
        # NOTE(review): getDataSetPatentColumn calls catc.splitToList for the
        # same semicolon case - confirm catc.stringToList is the intended
        # helper here.
        df['CPCs'] = catc.stringToList(CPCs, ';')
    else:
        df['CPCs'] = catc.splitLinesToList(CPCs)
    return df
def updateDataSetCategories(data_set, df, publicationNumberColumnName, categoryColumnName):
    """Store the categories held in ``df`` as predicted categories of patents.

    The category column of ``df`` is first replaced, in place, by the result
    of ``catc.stringListToList``. Then, for every row, the patents of the
    data set whose ``publication_numbers`` equals the row's publication
    number get ``predicted_categories`` set to the row's category value.

    Args:
        data_set: Name of the ``Datasets`` row whose patents are updated.
        df: DataFrame holding publication numbers and categories.
        publicationNumberColumnName: Column of ``df`` with publication numbers.
        categoryColumnName: Column of ``df`` with the categories.

    Raises:
        IndexError: If ``df`` has rows but no data set is named ``data_set``.
    """
    df[categoryColumnName] = catc.stringListToList(df[categoryColumnName].tolist())
    publicationNumbers = df[publicationNumberColumnName].tolist()
    categories = df[categoryColumnName].tolist()
    if not publicationNumbers:
        # Nothing to write; like the per-row lookup this replaces, do not
        # touch (or require) the data set when there are no rows.
        return

    # Resolve the data set id once: indexing the queryset inside the loop
    # issued an extra database query for every row.
    dataSetId = Datasets.objects.filter(name=data_set)[0].id
    for publicationNumber, category in zip(publicationNumbers, categories):
        Patents.objects.filter(
            dataset_id=dataSetId,
            publication_numbers=publicationNumber,
        ).update(predicted_categories=category)
def getTrainingSetPatents(training_set):
    """Build a DataFrame of the patents belonging to a training set.

    Columns: ``'ids'``, ``'Titles'``, ``'Abstracts'``,
    ``'Independent Claims'`` and ``'Categories'``. The ``classification``
    strings are parsed with ``catc.stringListToList`` when they look like a
    bracketed or parenthesised list, otherwise with
    ``catc.splitLinesToList``.

    Args:
        training_set: Name of the ``TrainingDatasets`` row to read.

    Returns:
        A new ``pandas.DataFrame`` with one row per training patent.

    Raises:
        IndexError: If no training set is named ``training_set``.
    """
    trainingSet = TrainingDatasets.objects.filter(name=training_set)
    patents = TrainingPatents.objects.filter(training_datasets_id=trainingSet[0].id)
    # One query for all fields keeps the columns row-aligned and avoids a
    # database round trip per column.
    rows = list(patents.values_list(
        'id', 'titles', 'abstracts', 'independent_claims', 'classification'))

    df = pd.DataFrame()
    df['ids'] = [row[0] for row in rows]
    df['Titles'] = [row[1] for row in rows]
    df['Abstracts'] = [row[2] for row in rows]
    df['Independent Claims'] = [row[3] for row in rows]
    categories = [row[4] for row in rows]

    # Sniff the format from the first non-empty classification. Looking only
    # at categories[0] raised IndexError for an empty training set and
    # TypeError when the first value was NULL.
    sample = next((c for c in categories if c), '')
    looksLikeList = (('[' in sample and ']' in sample)
                     or ('(' in sample and ')' in sample))
    if looksLikeList:
        df['Categories'] = catc.stringListToList(categories)
    else:
        df['Categories'] = catc.splitLinesToList(categories)

    return df
def getDataSetPatentTechnicalConcepts(data_set, df):
    """Load patent ids and technical concepts of a data set into ``df``.

    Writes ``df['ids']`` and ``df['Technical Concepts']``; the latter is the
    ``technical_concepts`` field passed through ``catc.stringListToList``.

    Args:
        data_set: Name of the ``Datasets`` row whose patents are read.
        df: DataFrame to fill; it is modified in place and returned.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no data set is named ``data_set``.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)
    # Single query so ids and values are guaranteed to be row-aligned.
    rows = list(patents.values_list('id', 'technical_concepts'))
    df['ids'] = [row[0] for row in rows]
    df['Technical Concepts'] = catc.stringListToList([row[1] for row in rows])
    return df
def getDataSetPatentCPCDescriptions(data_set, df):
    """Load patent ids and a ``'CPC Descriptions'`` column into ``df``.

    Writes ``df['ids']`` and ``df['CPC Descriptions']``; the latter is the
    ``cpc`` field passed through ``catc.splitLinesToList``.

    Args:
        data_set: Name of the ``Datasets`` row whose patents are read.
        df: DataFrame to fill; it is modified in place and returned.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no data set is named ``data_set``.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)
    # Single query so ids and values are guaranteed to be row-aligned.
    # NOTE(review): this reads the 'cpc' field, whereas getDataSetPatents
    # fills 'CPC Descriptions' from 'cpc_descriptions' via
    # catc.stringListToList - confirm which source is intended here.
    rows = list(patents.values_list('id', 'cpc'))
    df['ids'] = [row[0] for row in rows]
    df['CPC Descriptions'] = catc.splitLinesToList([row[1] for row in rows])
    return df
def getDataSetPatentCategoryByKeywords(data_set, df):
    """Load patent ids and keyword-based categories of a data set into ``df``.

    Writes ``df['ids']`` and ``df['Category by Keywords']``; the latter is
    the ``category_by_keywords`` field passed through
    ``catc.stringListToList``.

    Args:
        data_set: Name of the ``Datasets`` row whose patents are read.
        df: DataFrame to fill; it is modified in place and returned.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no data set is named ``data_set``.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)
    # Single query so ids and values are guaranteed to be row-aligned.
    rows = list(patents.values_list('id', 'category_by_keywords'))
    df['ids'] = [row[0] for row in rows]
    df['Category by Keywords'] = catc.stringListToList([row[1] for row in rows])
    return df
def getDataSetPatentMarketSegments(data_set, df):
    """Load patent ids and market segments of a data set into ``df``.

    Writes ``df['ids']`` and ``df['Market Segments']``; the latter is the
    ``market_segment`` field passed through ``catc.stringListToList``.

    Args:
        data_set: Name of the ``Datasets`` row whose patents are read.
        df: DataFrame to fill; it is modified in place and returned.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no data set is named ``data_set``.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)
    # Single query so ids and values are guaranteed to be row-aligned.
    rows = list(patents.values_list('id', 'market_segment'))
    df['ids'] = [row[0] for row in rows]
    df['Market Segments'] = catc.stringListToList([row[1] for row in rows])
    return df
def getTrainingSetPatentClassifications(training_set):
    """Return the distinct classification labels used in a training set.

    The ``classification`` field of every training patent is split with
    ``catc.splitLinesToList`` and the resulting per-patent lists are merged
    into one collection without duplicates.

    Args:
        training_set: Name of the ``TrainingDatasets`` row to read.

    Returns:
        A list of the unique labels, in no particular order.

    Raises:
        IndexError: If no training set is named ``training_set``.
    """
    trainingSet = TrainingDatasets.objects.filter(name=training_set)
    patents = TrainingPatents.objects.filter(training_datasets_id=trainingSet[0].id)
    categories = [d[0] for d in patents.values_list('classification')]

    # Accumulate straight into a set: repeated `list + list` concatenation
    # copies the growing list on every patent (quadratic overall).
    uniqueCategories = set()
    for categoryList in catc.splitLinesToList(categories):
        uniqueCategories.update(categoryList)
    return list(uniqueCategories)
def getDataSetPatentPredictedCategories(data_set):
    """Build a DataFrame of patent ids and predicted categories.

    Columns: ``'ids'`` and ``'Predicted Categories'``; the latter is the
    ``predicted_categories`` field passed through ``catc.stringListToList``.

    Args:
        data_set: Name of the ``Datasets`` row whose patents are read.

    Returns:
        A new ``pandas.DataFrame`` with one row per patent.

    Raises:
        IndexError: If no data set is named ``data_set``.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)
    # Single query so ids and values are guaranteed to be row-aligned.
    rows = list(patents.values_list('id', 'predicted_categories'))

    df = pd.DataFrame()
    df['ids'] = [row[0] for row in rows]
    df['Predicted Categories'] = catc.stringListToList([row[1] for row in rows])
    return df
def getDataSetPatentAssignees(data_set, df):
    """Load patent ids and assignee columns of a data set into ``df``.

    Writes four columns:

    * ``'ids'`` - patent primary keys.
    * ``'Current Assignees'`` - raw ``current_assignee`` values.
    * ``'Clean Assignees List'`` - ``clean_current_assignees`` passed
      through ``catc.stringListToList``.
    * ``'Clean Assignees'`` - ``clean_current_assignees`` as text with the
      characters ``'``, ``[`` and ``]`` removed.

    Args:
        data_set: Name of the ``Datasets`` row whose patents are read.
        df: DataFrame to fill; it is modified in place and returned.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: If no data set is named ``data_set``.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)
    # One query instead of four keeps all columns row-aligned.
    rows = list(patents.values_list(
        'id', 'current_assignee', 'clean_current_assignees'))
    df['ids'] = [row[0] for row in rows]
    df['Current Assignees'] = [row[1] for row in rows]
    cleanAssignees = [row[2] for row in rows]
    df['Clean Assignees List'] = catc.stringListToList(cleanAssignees)
    # str() is applied first, so a NULL value ends up as the text 'None'.
    df['Clean Assignees'] = [
        str(d).replace("'", '').replace("[", '').replace("]", '')
        for d in cleanAssignees
    ]
    return df
def getDataSetPatentsBySource(df, sourceName):
    """Load the patents of every data set with a given source into ``df``.

    Patents are gathered data set by data set, in the order the
    ``Datasets`` query yields them. Writes the columns ``'ids'``,
    ``'Dates'``, ``'Titles'``, ``'Abstracts'``, ``'CPCs'``, ``'Types'``,
    ``'Current Assignees'``, ``'Clean Assignees List'`` and
    ``'Clean Assignees'``.

    ``'Dates'`` holds the ``year`` field, parsed with the format
    ``'%B %d, %Y'`` and re-emitted as ``YYYYMMDD`` text. A missing (NULL or
    empty) date yields ``None`` instead of aborting the whole load.

    Args:
        df: DataFrame to fill; it is modified in place and returned.
        sourceName: Value of ``Datasets.source`` selecting the data sets.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        ValueError: If a non-empty date does not match ``'%B %d, %Y'``.
    """
    fields = ('id', 'year', 'titles', 'abstracts', 'cpc',
              'current_assignee', 'clean_current_assignees', 'patent_type')
    # One query per data set (instead of one per column per data set) keeps
    # every column row-aligned. A single row list also replaces the chained
    # `a = b = ... = []` initialisation, which bound eight names to one list.
    rows = []
    for dataSet in Datasets.objects.filter(source=sourceName):
        patents = Patents.objects.filter(dataset_id=dataSet.id)
        rows.extend(patents.values_list(*fields))

    def column(index):
        """Return the values at position ``index`` of every fetched row."""
        return [row[index] for row in rows]

    dates = []
    for rawDate in column(1):
        if rawDate:
            parsed = datetime.datetime.strptime(rawDate, '%B %d, %Y')
            # ISO date without dashes, i.e. YYYYMMDD.
            dates.append(parsed.date().isoformat().replace('-', ''))
        else:
            dates.append(None)

    df['ids'] = column(0)
    df['Dates'] = dates
    df['Titles'] = column(2)
    df['Abstracts'] = column(3)
    df['CPCs'] = catc.splitLinesToList(column(4))
    df['Types'] = column(7)
    df['Current Assignees'] = column(5)
    cleanAssignees = column(6)
    df['Clean Assignees List'] = catc.stringListToList(cleanAssignees)
    # str() is applied first, so a NULL value ends up as the text 'None'.
    df['Clean Assignees'] = [
        str(d).replace("'", '').replace("[", '').replace("]", '')
        for d in cleanAssignees
    ]
    return df
def updateAssigneeGrouping(df, keywordsDF):
    """Add an ``'Assignee Group'`` column that maps assignees to groups.

    ``df['Current Assignees']`` is split with ``catc.splitLinesToList``.
    Each assignee name is replaced by a group from ``keywordsDF`` when one
    of that group's keywords equals, case-insensitively, either a
    whitespace-separated word of the name or the whole name. If several
    keywords or groups match, the last match wins. Names are upper-cased
    and de-duplicated per row; empty names and the literals ``'NAN'`` /
    ``'nan'`` are dropped.

    Args:
        df: DataFrame with a ``'Current Assignees'`` column; modified in
            place and returned.
        keywordsDF: DataFrame with a ``'Keywords'`` column (an iterable of
            keywords per row) and a ``'Group'`` column.

    Returns:
        The same ``df`` object, with ``'Assignee Group'`` holding one list
        of group/assignee names per row (order within a list is not defined).
    """
    assigneeLists = catc.splitLinesToList(df['Current Assignees'].tolist())
    keywordLists = keywordsDF['Keywords'].tolist()
    groupNames = keywordsDF['Group'].tolist()

    groupedPerRow = []
    for assigneeList in assigneeLists:
        resolvedNames = []
        for assignee in assigneeList:
            if not (assignee and assignee != 'NAN' and assignee != 'nan'):
                continue
            nameWords = assignee.lower().split()
            resolved = assignee
            for keywords, groupName in zip(keywordLists, groupNames):
                for keyword in keywords:
                    # A keyword matches on any single word of the name or on
                    # the full name; later matches overwrite earlier ones.
                    matched = any(
                        keyword.lower() == word
                        or keyword.lower() == assignee.lower()
                        for word in nameWords
                    )
                    if matched:
                        resolved = groupName
            resolvedNames.append(resolved.upper())
        groupedPerRow.append(list(set(resolvedNames)))

    df['Assignee Group'] = groupedPerRow
    return df
def getDataSetPatents(data_set):
    """Build a DataFrame with all stored columns of a data set's patents.

    Plain fields are copied as-is. List-like text fields are parsed with
    ``catc.stringListToList``, except ``cpc``, which is parsed with
    ``catc.splitLinesToList``. ``'Clean Assignees'`` is the
    ``clean_current_assignees`` text with the characters ``'``, ``[`` and
    ``]`` removed.

    Args:
        data_set: Name of the ``Datasets`` row whose patents are read.

    Returns:
        A new ``pandas.DataFrame`` with one row per patent.

    Raises:
        IndexError: If no data set is named ``data_set``.
    """
    dataSet = Datasets.objects.filter(name=data_set)
    patents = Patents.objects.filter(dataset_id=dataSet[0].id)

    fields = (
        'id', 'year', 'publication_numbers', 'titles', 'abstracts',
        'independent_claims', 'technical_concepts', 'main_cpc',
        'main_cpc_description', 'cpc', 'cpc_descriptions',
        'current_assignee', 'predicted_market_segments', 'predicted_topic',
        'patent_type', 'category', 'market_segment', 'category_by_keywords',
        'predicted_categories', 'category2', 'category3', 'category4',
        'clean_current_assignees',
    )
    # Fetch every field in ONE query. Issuing a separate unordered query per
    # column costs a round trip each and does not guarantee that all
    # columns list the patents in the same order.
    rows = list(patents.values_list(*fields))
    position = {fieldName: index for index, fieldName in enumerate(fields)}

    def column(fieldName):
        """Return a fresh list with the values of ``fieldName`` per row."""
        index = position[fieldName]
        return [row[index] for row in rows]

    df = pd.DataFrame()
    df['ids'] = column('id')
    df['Years'] = column('year')
    df['Publication Numbers'] = column('publication_numbers')
    df['Titles'] = column('titles')
    df['Abstracts'] = column('abstracts')
    df['Independent Claims'] = column('independent_claims')
    df['Technical Concepts'] = catc.stringListToList(column('technical_concepts'))
    df['Main CPC'] = column('main_cpc')
    df['Main CPC Description'] = column('main_cpc_description')
    df['CPCs'] = catc.splitLinesToList(column('cpc'))
    df['CPC Descriptions'] = catc.stringListToList(column('cpc_descriptions'))
    df['Current Assignees'] = column('current_assignee')
    df['Predicted Market Segments'] = column('predicted_market_segments')
    df['Predicted Topics'] = column('predicted_topic')
    df['Types'] = column('patent_type')

    df['Categories'] = catc.stringListToList(column('category'))
    df['Market Segments'] = catc.stringListToList(column('market_segment'))
    df['Category by Keywords'] = catc.stringListToList(column('category_by_keywords'))
    df['Predicted Categories'] = catc.stringListToList(column('predicted_categories'))

    df['Categories2'] = catc.stringListToList(column('category2'))
    df['Categories3'] = catc.stringListToList(column('category3'))
    df['Categories4'] = catc.stringListToList(column('category4'))

    cleanAssignees = column('clean_current_assignees')
    df['Clean Assignees List'] = catc.stringListToList(cleanAssignees)
    # str() is applied first, so a NULL value ends up as the text 'None'.
    df['Clean Assignees'] = [
        str(d).replace("'", '').replace("[", '').replace("]", '')
        for d in cleanAssignees
    ]
    return df