Example #1
from csvw.dsv import UnicodeDictReader


# sniff() and _language_column_names are helpers defined elsewhere in the
# same module as this function.
def iterlocations(filename):
    with UnicodeDictReader(filename,
                           dialect=sniff(filename,
                                         default_dialect=None)) as reader:
        # Pair each fieldname with its lowercase form so that matching
        # against the alias lists below is case-insensitive
        fieldnames = [(n.lower(), n) for n in reader.fieldnames]
        fieldmap = {}

        for field, aliases in [
            ('language identifier', _language_column_names),
            ('latitude', ("latitude", "lat")),
            ('longitude', ("longitude", "lon", "long")),
        ]:
            for lname, fieldname in fieldnames:
                if lname in aliases:
                    fieldmap[field] = fieldname
                    break
            else:
                # The inner loop matched none of the aliases for this field
                raise ValueError(
                    "Could not find a {0} column in location data file {1}".
                    format(field, filename))

        for row in reader:
            (lat, lon) = row[fieldmap['latitude']], row[fieldmap['longitude']]
            try:
                # "?" marks an unknown coordinate and is passed through
                lat = float(lat) if lat != "?" else lat
                lon = float(lon) if lon != "?" else lon
            except ValueError:
                # Treat coordinates that fail to parse as unknown
                lat, lon = "?", "?"
            yield (row[fieldmap['language identifier']].strip(), (lat, lon))
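
A minimal way to drive this generator, assuming the file's language-identifier column header is among the aliases in _language_column_names and using a placeholder path:

for language, (lat, lon) in iterlocations("locations.csv"):
    print(language, lat, lon)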
Example #2
import collections
import json
import zipfile

from csvw.dsv import UnicodeDictReader


def run(args):
    paths = {
        p.stem.split('-')[1]: p
        for p in args.repos.path('mappings').glob('map-*.tsv')
    }
    translate = {
        'Person/Thing': 'noun',
        'Other': 'other',
        'Number': 'numeral',
        'Action/Process': 'verb',
        'Property': 'adjective',
        'Classifier': 'classifier'
    }
    mappings = {}
    for language, path in paths.items():
        mappings[language] = collections.defaultdict(set)
        with UnicodeDictReader(path, delimiter='\t') as reader:
            for line in reader:
                # keep the part of the gloss after the '///' separator
                gloss = line['GLOSS'].split('///')[1]
                oc = translate.get(
                    args.repos.conceptsets[line['ID']].ontological_category,
                    'other')
                cgl = args.repos.conceptsets[line['ID']].gloss
                mappings[language][gloss].add(
                    (line['ID'], cgl, int(line['PRIORITY']), oc, 1))
            # Add a lowercase variant of each gloss (flagged 0 in the last
            # tuple slot) unless a lowercase key already exists
            for gloss in list(mappings[language].keys()):
                if gloss.lower() not in mappings[language]:
                    mappings[language][gloss.lower()] = set([
                        (x[0], x[1], x[2], x[3], 0)
                        for x in mappings[language][gloss]
                    ])

    # Sets are not JSON-serializable, so convert each one to a list,
    # sorted by Concepticon gloss for deterministic output
    for language in paths:
        for k, v in mappings[language].items():
            mappings[language][k] = sorted(v, key=lambda x: x[1], reverse=True)

    with zipfile.ZipFile(args.destination,
                         mode='w',
                         compression=zipfile.ZIP_DEFLATED) as myzip:
        myzip.writestr('concepticon.json', json.dumps(mappings))
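
The archive written here is what Example #4 reads back. A minimal sketch of consuming it, assuming the destination was "concepticon.zip" and using hypothetical language and gloss keys:

import json
import zipfile

with zipfile.ZipFile("concepticon.zip") as zf:
    mappings = json.loads(zf.read("concepticon.json"))
# each candidate is [ID, gloss, priority, part of speech, case flag]
for candidate in mappings.get("en", {}).get("hand", []):
    print(candidate)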
Example #3
import sys

from csvw.dsv import UnicodeDictReader


# sniff(), read_cldf_dataset(), load_cldf_data() and load_beastling_data()
# are helpers defined elsewhere in the same module as this function.
def load_data(filename,
              file_format=None,
              lang_column=None,
              value_column=None,
              expect_multiple=False):
    # Handle CSV dialect issues
    if str(filename) == 'stdin':
        filename = sys.stdin
        # We can't sniff from stdin, so guess comma-delimited and hope for
        # the best
        dialect = "excel"  # Default dialect for csv module
    elif file_format and file_format.lower() == "cldf":
        return read_cldf_dataset(filename,
                                 value_column,
                                 expect_multiple=expect_multiple)
    elif file_format and file_format.lower() == "cldf-legacy":
        # CLDF pre-1.0 standard says delimiter is indicated by file extension
        if filename.suffix.lower() == ".csv" or str(filename) == "stdin":
            dialect = "excel"
        elif filename.suffix.lower() == ".tsv":
            dialect = "excel-tab"
        else:
            raise ValueError(
                "CLDF standard dictates that filenames must end in .csv or .tsv"
            )
    elif filename.suffix == ".json" or filename.name in {
            "forms.csv", "values.csv"
    }:
        # TODO: Should we just let the pycldf module try its hands on the file
        # and fall back to other formats if that doesn't work?
        return read_cldf_dataset(filename,
                                 value_column,
                                 expect_multiple=expect_multiple)
    else:
        # Use CSV dialect sniffer in all other cases
        dialect = sniff(filename)
    # Read the file with the chosen dialect
    with UnicodeDictReader(filename, dialect=dialect) as reader:
        # Guesstimate the file format if the user has not been explicit
        if file_format is None:
            has_cldf_columns = (
                all(f in reader.fieldnames for f in ("Language_ID", "Value"))
                and any(f in reader.fieldnames
                        for f in ("Feature_ID", "Parameter_ID")))
            file_format = 'cldf-legacy' if has_cldf_columns else 'beastling'

        # Load data
        if file_format == 'cldf-legacy':
            data = load_cldf_data(reader,
                                  value_column,
                                  filename,
                                  expect_multiple=expect_multiple)
        elif file_format == 'beastling':
            data = load_beastling_data(reader,
                                       lang_column,
                                       filename,
                                       expect_multiple=expect_multiple)
        else:
            raise ValueError(
                "File format specification '{:}' not understood".format(
                    file_format))
    return data, {}
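
Because the function inspects filename.suffix, it expects a pathlib.Path (or the literal string 'stdin'), and the second return value is always an empty dict here. A minimal call sketch with placeholder arguments:

from pathlib import Path

data, metadata = load_data(Path("values.csv"),
                           file_format="cldf-legacy",
                           value_column="Value")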
Example #4
from pathlib import Path
from csvw.dsv import UnicodeDictReader
import zipfile
import json


def data_path(*path):
    return Path(__file__).parent.joinpath("data", *path)


with UnicodeDictReader(data_path("sense.csv"), delimiter=",") as reader:
    SENSE = {}
    for row in reader:
        # ITEMS appears to be ";"-separated with a trailing ";", so drop
        # the empty final element that split produces
        SENSE[row["HEADWORD"]] = frozenset(row["ITEMS"].split(";")[:-1])


def get_Concepticon():
    with zipfile.ZipFile(data_path("concepticon.zip").as_posix(), "r") as zf:
        concepticon = json.loads(zf.read("concepticon.json"))
    return concepticon
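
A minimal sketch of using the two lookups together, with hypothetical keys:

concepticon = get_Concepticon()
items = SENSE.get("tree", frozenset())  # hypothetical headword
candidates = concepticon.get("en", {}).get("tree", [])  # structure from Example #2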
Example #5
from pathlib import Path
from csvw.dsv import UnicodeDictReader


def data_path(*path):
    return Path(__file__).parent.joinpath('data', *path)


with UnicodeDictReader(data_path('hamnosys.tsv'), delimiter="\t") as reader:
    HAMNOSYS = {}
    for row in reader:
        # The Unicode column holds a hexadecimal code point; convert it
        # with chr()/int() rather than eval-ing a constructed string literal
        HAMNOSYS[chr(int(row['Unicode'], 16))] = row
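
A minimal lookup sketch; HamNoSys glyphs live in the Private Use Area, so the code point below is only a hypothetical example:

row = HAMNOSYS.get("\ue000")
if row is not None:
    print(row["Unicode"])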
Example #6
from collections import OrderedDict, defaultdict
from csvw.dsv import UnicodeDictReader

# read simlex data
scores, simlex, conc = {}, {}, {}

with UnicodeDictReader('scores.csv') as reader:
    for row in reader:
        scores[row['ID']] = row
with UnicodeDictReader('translation.csv') as reader:
    for row in reader:
        simlex[row['ID']] = row

# read our data and merge entries that share the same Concepticon ID
# under a single gloss
mappings = defaultdict(list)
visited = set()
wrong_translations = []
with UnicodeDictReader('simlex-concepticon.tsv', delimiter="\t") as reader:
    for row in reader:
        # index each row by its own NUMBER ...
        conc[row['NUMBER']] = row
        # ... and by the partner of every SimLex ID: IDs come in pairs
        # that differ only in their trailing '1'/'2'
        for idx in row['SIMLEX_IDS'].split():
            if idx.endswith('1'):
                idx = idx[:-1] + '2'
            else:
                idx = idx[:-1] + '1'
            conc[idx] = row
        if row['CONCEPTICON_ID'].strip():
            gloss = row['CONCEPTICON_ID'] + '/' + row['CONCEPTICON_GLOSS']
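
The trailing-digit toggle maps each SimLex ID to its partner in the pair; a minimal illustration with made-up IDs:

def partner(idx):
    # "x1" <-> "x2": pair members differ only in the final digit
    return idx[:-1] + ('2' if idx.endswith('1') else '1')

assert partner("old1") == "old2"
assert partner("old2") == "old1"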
Example #7
from csvw.dsv import UnicodeDictReader
from collections import defaultdict, OrderedDict
from pysen.glosses import to_concepticon

data = defaultdict(list)
with UnicodeDictReader("data.csv", delimiter=",") as reader:
    for i, row in enumerate(reader):
        w1, w2 = row["Target"], row["Relatum"]
        # register the row under both words of the pair, recording the
        # one-based row number, the pair label, the word's position
        # (0 = Target, 1 = Relatum), and the partner word
        data[w1] += [(i + 1, row["Pair"].replace(' ', '-'), 0, w2, row)]
        data[w2] += [(i + 1, row["Pair"].replace(' ', '-'), 1, w1, row)]

# pos converter
poses = {"ADJ": "adjective", "NOUN": "noun", "VERB": "verb"}

# we need to get the sorted words to be able to number them
numbered = OrderedDict([(y, x + 1) for x, y in enumerate(sorted(data))])

with open('Scheible-2014.tsv', 'w') as f:
    f.write('\t'.join([
        "NUMBER", "GERMAN", "POS", "CONCEPTICON_ID", "CONCEPTICON_GLOSS",
        "IDS_IN_SOURCE", "LINKS", "RELATION_TYPE", "SCORES"
    ]) + "\n")
    # iterate over the data now
    for word, number in numbered.items():
        values = data[word]
        indices = " ".join(
            ["{0}:{1}:{2}".format(x[0], x[1], x[2]) for x in values])
        links, scores, reltypes = [], [], []
        for idx, pair, pos, wordB, row in values:
            if pos == 0:
Example #8
from csvw.dsv import UnicodeDictReader
from collections import defaultdict, OrderedDict
from pysen.glosses import to_concepticon

data = defaultdict(list)
with UnicodeDictReader("exp-ratings-EN.csv", delimiter=" ") as reader:
    for i, row in enumerate(reader):
        w1, w2 = row["Target"], row["Relatum"]
        # register the row under both words of the pair, as in Example #7
        data[w1] += [(i + 1, row["Pair"].replace(' ', '-'), 0, w2, row)]
        data[w2] += [(i + 1, row["Pair"].replace(' ', '-'), 1, w1, row)]

# pos converter
poses = {"ADJ": "adjective", "NOUN": "noun", "VERB": "verb"}

# we need to get the sorted words to be able to number them
numbered = OrderedDict([(y, x + 1) for x, y in enumerate(sorted(data))])

with open('Scheible-2014.tsv', 'w') as f:
    f.write('\t'.join([
        "NUMBER",
        "ENGLISH",
        "POS",
        "CONCEPTICON_ID",
        "CONCEPTICON_GLOSS",
        "IDS_IN_SOURCE",
        "LINKS",
        "POLYSEMY_CLASS",
        "FREQUENCY_CLASS",
        "DEGREE_CLASS",
        "RELATION_TYPE",