def iterlocations(filename):
    with UnicodeDictReader(filename, dialect=sniff(filename, default_dialect=None)) as reader:
        # Identify fieldnames
        fieldnames = [(n.lower(), n) for n in reader.fieldnames]
        fieldmap = {}
        for field, aliases in [
            ('language identifier', _language_column_names),
            ('latitude', ("latitude", "lat")),
            ('longitude', ("longitude", "lon", "long")),
        ]:
            for lname, fieldname in fieldnames:
                if lname in aliases:
                    fieldmap[field] = fieldname
                    break
            else:
                raise ValueError(
                    "Could not find a {0} column in location data file {1}".format(
                        field, filename))
        for row in reader:
            (lat, lon) = row[fieldmap['latitude']], row[fieldmap['longitude']]
            try:
                lat = float(lat) if lat != "?" else lat
                lon = float(lon) if lon != "?" else lon
            except ValueError:
                lat, lon = "?", "?"
            yield (row[fieldmap['language identifier']].strip(), (lat, lon))
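# A minimal usage sketch for iterlocations(), not part of the original
# module: the file name and headers below are hypothetical, and we assume
# "language" is among the aliases in _language_column_names.
#
#   locations.csv:
#       language,latitude,longitude
#       deu,51.0,10.0
#       xyz,?,?
#
#   for lang, (lat, lon) in iterlocations("locations.csv"):
#       print(lang, lat, lon)   # unknown "?" coordinates pass through as-is
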
def run(args):
    paths = {
        p.stem.split('-')[1]: p
        for p in args.repos.path('mappings').glob('map-*.tsv')
    }
    translate = {
        'Person/Thing': 'noun',
        'Other': 'other',
        'Number': 'numeral',
        'Action/Process': 'verb',
        'Property': 'adjective',
        'Classifier': 'classifier'
    }
    mappings = {}
    for language, path in paths.items():
        mappings[language] = collections.defaultdict(set)
        with UnicodeDictReader(path, delimiter='\t') as reader:
            for line in reader:
                gloss = line['GLOSS'].split('///')[1]
                oc = translate.get(
                    args.repos.conceptsets[line['ID']].ontological_category,
                    'other')
                cgl = args.repos.conceptsets[line['ID']].gloss
                mappings[language][gloss].add(
                    (line['ID'], cgl, int(line['PRIORITY']), oc, 1))
        for gloss in list(mappings[language].keys()):
            if gloss.lower() not in mappings[language]:
                mappings[language][gloss.lower()] = set([
                    (x[0], x[1], x[2], x[3], 0)
                    for x in mappings[language][gloss]
                ])
    for language, path in paths.items():
        for k, v in mappings[language].items():
            mappings[language][k] = sorted(v, key=lambda x: x[1], reverse=True)
    with zipfile.ZipFile(args.destination, mode='w',
                         compression=zipfile.ZIP_DEFLATED) as myzip:
        myzip.writestr('concepticon.json', json.dumps(mappings))
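# Sketch of the JSON written above, inferred from the code rather than
# from the file itself: per language, each gloss maps to a sorted list of
#   [CONCEPTICON_ID, concepticon_gloss, priority, part_of_speech, flag]
# entries, where flag is 1 for the original gloss and 0 for the
# lower-cased fallback variant added in the second loop.
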
def load_data(filename, file_format=None, lang_column=None, value_column=None,
              expect_multiple=False):
    # Handle CSV dialect issues
    if str(filename) == 'stdin':
        filename = sys.stdin
        # We can't sniff from stdin, so guess comma-delimited and hope for
        # the best
        dialect = "excel"  # Default dialect for csv module
    elif file_format and file_format.lower() == "cldf":
        return read_cldf_dataset(filename, value_column,
                                 expect_multiple=expect_multiple)
    elif file_format and file_format.lower() == "cldf-legacy":
        # CLDF pre-1.0 standard says delimiter is indicated by file extension
        if filename.suffix.lower() == ".csv" or str(filename) == "stdin":
            dialect = "excel"
        elif filename.suffix.lower() == ".tsv":
            dialect = "excel-tab"
        else:
            raise ValueError(
                "CLDF standard dictates that filenames must end in .csv or .tsv")
    elif filename.suffix == ".json" or filename.name in {"forms.csv", "values.csv"}:
        # TODO: Should we just let the pycldf module try its hands on the file
        # and fall back to other formats if that doesn't work?
        return read_cldf_dataset(filename, value_column,
                                 expect_multiple=expect_multiple)
    else:
        # Use CSV dialect sniffer in all other cases
        dialect = sniff(filename)

    # Read
    with UnicodeDictReader(filename, dialect=dialect) as reader:
        # Guesstimate file format if user has not been explicit
        if file_format is None:
            file_format = 'cldf-legacy' if all(
                [f in reader.fieldnames for f in ("Language_ID", "Value")]
            ) and any(
                [f in reader.fieldnames for f in ("Feature_ID", "Parameter_ID")]
            ) else 'beastling'
        # Load data
        if file_format == 'cldf-legacy':
            data = load_cldf_data(reader, value_column, filename,
                                  expect_multiple=expect_multiple)
        elif file_format == 'beastling':
            data = load_beastling_data(reader, lang_column, filename,
                                       expect_multiple=expect_multiple)
        else:
            raise ValueError(
                "File format specification '{:}' not understood".format(
                    file_format))
    return data, {}
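# Hedged usage sketch (file names hypothetical; read_cldf_dataset,
# load_cldf_data and load_beastling_data are module-level helpers not
# shown here):
#
#   data, _ = load_data(Path("my_data.csv"))                    # dialect sniffed
#   data, _ = load_data(Path("forms.csv"), file_format="cldf")  # CLDF 1.0
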
from pathlib import Path
from csvw.dsv import UnicodeDictReader
import zipfile
import json


def data_path(*path):
    return Path(__file__).parent.joinpath("data", *path)


with UnicodeDictReader(data_path("sense.csv"), delimiter=",") as reader:
    SENSE = {}
    for row in reader:
        SENSE[row["HEADWORD"]] = frozenset(row["ITEMS"].split(";")[:-1])


def get_Concepticon():
    with zipfile.ZipFile(data_path("concepticon.zip").as_posix(), "r") as zf:
        concepticon = json.loads(zf.read("concepticon.json"))
    return concepticon
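# Usage sketch, assuming the bundled data files: SENSE maps a headword to
# the frozenset of its ";"-separated items (the trailing empty field is
# dropped), and get_Concepticon() returns the mappings dict, presumably
# with the structure serialized into concepticon.zip by run() above.
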
from pathlib import Path
from csvw.dsv import UnicodeDictReader


def data_path(*path):
    return Path(__file__).parent.joinpath('data', *path)


with UnicodeDictReader(data_path('hamnosys.tsv'), delimiter="\t") as reader:
    HAMNOSYS = {}
    for row in reader:
        # Decode the four-digit hex code point into its character
        # (equivalent to, but safer than, eval'ing a "\uXXXX" escape string)
        HAMNOSYS[chr(int(row['Unicode'], 16))] = row
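# Each key of HAMNOSYS is thus a single HamNoSys glyph and each value its
# metadata row, e.g. HAMNOSYS[chr(0xE000)] if the table contains the
# (hypothetical) code point E000.
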
from collections import OrderedDict, defaultdict
from csvw.dsv import UnicodeDictReader

# read simlex data
scores, simlex, conc = {}, {}, {}
with UnicodeDictReader('scores.csv') as reader:
    for row in reader:
        scores[row['ID']] = row
with UnicodeDictReader('translation.csv') as reader:
    for row in reader:
        simlex[row['ID']] = row

# read our data and merge entries that share the same Concepticon ID into
# the same gloss
mappings = defaultdict(list)
visited = set()
wrong_translations = []
with UnicodeDictReader('simlex-concepticon.tsv', delimiter="\t") as reader:
    for row in reader:
        for idx in row['SIMLEX_IDS'].split():
            conc[row['NUMBER']] = row
        idxs = row['SIMLEX_IDS'].split()
        for idx in idxs:
            # also register the row under the partner ID (swap trailing 1/2)
            if idx.endswith('1'):
                idx = idx[:-1] + '2'
            else:
                idx = idx[:-1] + '1'
            conc[idx] = row
        if row['CONCEPTICON_ID'].strip():
            gloss = row['CONCEPTICON_ID'] + '/' + row['CONCEPTICON_GLOSS']
from csvw.dsv import UnicodeDictReader
from collections import defaultdict, OrderedDict
from pysen.glosses import to_concepticon

data = defaultdict(list)
with UnicodeDictReader("data.csv", delimiter=",") as reader:
    for i, row in enumerate(reader):
        w1, w2 = row["Target"], row["Relatum"]
        # we add the row to the data
        data[w1] += [(i + 1, row["Pair"].replace(' ', '-'), 0, w2, row)]
        data[w2] += [(i + 1, row["Pair"].replace(' ', '-'), 1, w1, row)]

# pos converter
poses = {"ADJ": "adjective", "NOUN": "noun", "VERB": "verb"}

# we need to get the sorted words to be able to number them
numbered = OrderedDict([(y, x + 1) for x, y in enumerate(sorted(data))])

with open('Scheible-2014.tsv', 'w') as f:
    f.write('\t'.join([
        "NUMBER", "GERMAN", "POS", "CONCEPTICON_ID", "CONCEPTICON_GLOSS",
        "IDS_IN_SOURCE", "LINKS", "RELATION_TYPE", "SCORES"
    ]) + "\n")
    # iterate over the data now
    for word, number in numbered.items():
        values = data[word]
        indices = " ".join(
            ["{0}:{1}:{2}".format(x[0], x[1], x[2]) for x in values])
        links, scores, reltypes = [], [], []
        for idx, pair, pos, wordB, row in values:
            if pos == 0:
from csvw.dsv import UnicodeDictReader
from collections import defaultdict, OrderedDict
from pysen.glosses import to_concepticon

data = defaultdict(list)
with UnicodeDictReader("exp-ratings-EN.csv", delimiter=" ") as reader:
    for i, row in enumerate(reader):
        w1, w2 = row["Target"], row["Relatum"]
        # we add the row to the data
        data[w1] += [(i + 1, row["Pair"].replace(' ', '-'), 0, w2, row)]
        data[w2] += [(i + 1, row["Pair"].replace(' ', '-'), 1, w1, row)]

# pos converter
poses = {"ADJ": "adjective", "NOUN": "noun", "VERB": "verb"}

# we need to get the sorted words to be able to number them
numbered = OrderedDict([(y, x + 1) for x, y in enumerate(sorted(data))])

with open('Scheible-2014.tsv', 'w') as f:
    f.write('\t'.join([
        "NUMBER", "ENGLISH", "POS", "CONCEPTICON_ID", "CONCEPTICON_GLOSS",
        "IDS_IN_SOURCE", "LINKS", "POLYSEMY_CLASS", "FREQUENCY_CLASS",
        "DEGREE_CLASS", "RELATION_TYPE",