Пример #1
0
def strip_duplicates(values):
    """
    Remove repeated entries from values, keeping the first occurrence of each.

    Entries are compared in the form produced by normalise_strings; the
    entries that survive are handed to denormalise_strings, together with
    the map normalise_strings returned, and that result is returned.
    """
    normalised, lookup = normalise_strings(values)

    unique = []
    for candidate in normalised:
        # skip anything whose normalised form has already been kept
        if candidate in unique:
            continue
        unique.append(candidate)

    return denormalise_strings(unique, lookup)
def detect_and_strip_duplicates(values):
    """
    Remove repeated entries from values and report every duplicate found.

    The de-duplication is the same as in strip_duplicates: entries are
    compared in the form produced by normalise_strings and only the first
    occurrence is kept. In addition, each time an already-seen normalised
    entry turns up, the complete input and that entry are printed to stdout.

    values -- the strings to de-duplicate (passed to normalise_strings)

    Returns the result of denormalise_strings for the unique normalised
    entries.
    """
    norm_values, lookup = normalise_strings(values)

    new_values = []
    for value in norm_values:
        if value not in new_values:
            new_values.append(value)
        else:
            # A single-argument print emits the same text as the former
            # Python 2 print statement and is also valid on Python 3.
            print('%s . Duplicate was: %s' % (values, value))

    return denormalise_strings(new_values, lookup)
Пример #3
0
def contains_non_canonical_org(data):
    """
    Look for a known non-canonical organisation name inside data.

    data -- the string to inspect; it is stripped and lower-cased before
            the normalised keys of canonical_org_forms are searched for
            in it

    Returns the canonical form (the canonical_org_forms value) belonging to
    the first non-canonical name found in data, or False if data is only
    whitespace or contains none of the names.
    """
    value = data.strip().lower()
    if not value:
        # it was only whitespace
        return False

    # key_lookup leads from a normalised key back to the key as written in
    # canonical_org_forms -- assumes normalise_strings returns
    # (normalised strings, {normalised: original}); TODO confirm
    norm_keys, key_lookup = normalise_strings(canonical_org_forms.keys())

    for o in norm_keys:
        if o in value:
            # Look up by the key that matched, not by the whole input:
            # the input is usually longer than the key it contains.
            return canonical_org_forms[key_lookup[o]]

    return False
Пример #4
0
metadata cleanup tasks
"""

from csvwrapper import normalise_strings, denormalise_strings

# Data shared between common functions and rules.
# known_organisations is the list of organisation names used by this module.
# Further down, a backslash-escaped variant is appended for every name that
# contains an apostrophe, and the list is then passed to normalise_strings.
# NOTE(review): a few entries look like misspellings or near-duplicates
# (e.g. "Croyden College", "West Hertforsdhire College", the two
# "Imperial College London" forms) -- presumably kept to match values seen in
# the source data; confirm before correcting any of them.
known_organisations = ["Aston Business School", "Aston University", "Bangor University", "Barrow in Furness 6th Form College", "Bede College", "Bexley College", "Bishop Auckland College", "Blackburn College", "Blackburn University Centre", "Boston College", "Bournemouth University", "Bournemouth and Poole College", "Bournville College", "Bradford College", "Bradford University", "Braintree College", "Bristol University", "Brockenhurst College", "Brunel University", "Burton College", "CILIP", "Calderdale College", "Camberwell college of the arts", "Canterbury Christ Church University", "Cardiff Metropolitan University", "Cardiff University", "Central Saint Martins College of Art And Design", "Coleg Llandrillo Cymru", "Colorado School of Mines", "Core-Materials", "Coventry University", "Craven College", "Croyden College", "De Montfort University", "Deeside College", "Dipex", "Division for Lifelong Learning - University of Bath", "Doncaster College", "Dunstable College", "EDINA", "East Durham and Houghall College", "Edge Hill University", "Edinburgh Napier University", "Fuseworks", "Gateshead College", "Gateway College", "Glasgow Caledonian University", "Grimsby College", "Harlow College", "Hartlepool College", "Hartpury College", "Harvard Law School", "Henley College", "Hibernia College", "Highbury College", "Huddersfield College", "Hull College", "Imperial College London", "Imperial College, London", "Institution of Enterprise", "JISC", "Keele University", "King's College London", "Lancaster University", "Learning And Skills Network Ltd", "Leeds College of Building", "Leeds General University", "Leeds Metropolitan University", "Leicester College", "Leicester University", "Liverpool John Moores University", "London College of Communication", "London College of Fashion", "London Metropolitan University", "Loughborough University", "Mimas", "Manchester Metropolitan University", "Massey University", "Melbourne School of Population Health", "Middlesbrough College", 
"Morley College", "Nelson and Colne College", "Nescot College", "New College Nottingham", "New College Telford", "Newcastle College", "Newcastle Under Lyme College", "Newcastle University", "North Hertfordshire College", "North West London College", "Northampton College", "Northumbria University", "Nottingham University", "Oaklands College", "Open Educational Repository In Support of Computer Science", "Open University", "Oxford Brookes University", "Oxford University", "Penwith College", "Peterborough College", "Phg", "Phgk", "Plymouth University", "Priestley College", "Queen Mary University of London", "Queen's University Belfast", "Reading University", "Redcar & Cleveland College", "Regent College", "Regent's College", "Roehampton University", "Rolls-Royce University Technology Centre", "Rose Bruford College of Theatre and Performance", "Royal Holloway University", "Royal Society of Chemistry", "Royal Veterinary College", "Saylor Foundation", "Scotland's Colleges", "Sheffield Hallam University", "Somerset College of Arts & Technology", "South Devon College", "South East Essex VI Form College", "South Thames College", "Southampton Solent University", "Southport College", "St Brendan's 6th Form College", "St George's, University of London", "Staffordshire University", "Stamford College", "Stevenson College", "Stockton 6th Form College", "Stockton Riverside College", "Stoke on Trent 6th Form College", "Technical University of Denmark", "Teesside University", "Thames Valley University", "The Learning Bank", "The University of Liverpool", "Totton College", "Tyne Metropolitan College", "UCLAN", "University College Falmouth", "University College London", "University Federico II", "University Portsmouth", "University for the Creative Arts", "University of Aberdeen", "University of Ancona", "University of Bath", "University of Bedfordshire", "University of Birmingham", "University of Bolton", "University of Bradford", "University of Brighton", "University of Bristol", 
"University of British Columbia", "University of Cambridge", "University of Central Lancashire", "University of Cumbria", "University of Derby", "University of Dundee", "University of East London", "University of Edinburgh", "University of Exeter", "University of Ferrara", "University of Genoa", "University of Glamorgan", "University of Glasgow", "University of Gloucestershire", "University of Hertfordshire", "University of Hull", "University of Keele", "University of Leeds", "University of Leicester", "University of Leuven", "University of Lincoln", "University of Liverpool", "University of Manchester", "University of Minnesota", "University of New South Wales", "University of Northumbria", "University of Nottingham", "University of Oxford", "University of Padova", "University of Portsmouth", "University of Reading", "University of Sheffield", "University of Southampton", "University of Stockholm", "University of Strathclyde", "University of Surrey", "University of Ulster", "University of Wales, Newport", "University of Warwick", "University of Westminster", "University of Wolverhampton", "University of Worcester", "University of York", "University of the Arts London", "Varndean College", "Wakefield College", "West Hertforsdhire College", "West Kent College", "West Nottinghamshire College", "Weston College", "Winstanley College", "Worcester College of Technology", "Worcester University", "X4L Healthier Nation", "York St John University"]

# Append a \' (backslash-escaped) variant of every organisation name that
# contains an apostrophe. The loop walks a snapshot of the list because it
# appends to the list itself.
known_organisations_lookup = list(known_organisations)
for o in known_organisations_lookup:
    if "'" not in o:
        continue
    known_organisations.append(o.replace("'", "\\'"))

# Run the extended list through normalise_strings, keeping both the
# normalised names and the map it returns alongside them.
norm_known_organisations, map_known_organisations = normalise_strings(known_organisations)

# Lower-case text -> full organisation name.
# NOTE(review): presumably a match on the key is ignored when the named
# organisation is also present -- confirm against the rules that use this.
ignore_if_org_present = {'university of wales': 'University of Wales, Newport'}

canonical_org_forms = {
    # non-canonical -> canonical mapping
    'University of Wales - Newport': 'University of Wales, Newport',
    'Stafforshire University': "Staffordshire University",
    "Fuseworks": "Fusedworks",
    'bradford managment school': "Bradford University",
    "Bradford Management School": "Bradford University",
    "Department of Criminology Leicester University": "Leicester University",
    "Department of Materials Science and Metallurgy, University of Cambridge": "University of Cambridge",
    "School of Geography, University of Leeds": "University of Leeds",