示例#1
0
def dedup():
    """Run a ``Deduplicator`` over ``cn/images.csv`` and print its count.

    The deduplicator is seeded with the ``id`` of every item of
    ``images.csv`` whose ``source_url`` contains the substring ``'edmond'``.
    """
    known_ids = []
    for item in csv_items('images.csv'):
        if 'edmond' in item['source_url']:
            known_ids.append(item['id'])

    deduplicator = Deduplicator(known_ids)
    visit('cn/images.csv', deduplicator)
    print(deduplicator.count)
示例#2
0
from __future__ import unicode_literals, print_function

from tsammalexdata.util import visit


class Visitor(object):
    """Row callback that inserts a new third column into every row."""

    def __call__(self, index, row):
        """Return a copy of *row* with one extra cell at position 2.

        :param index: position of the row; ``0`` denotes the header row.
        :param row: list of cell values.
        :return: new list; the inserted cell is ``'synonyms'`` for the header
            row and the empty string for every other row.
        """
        if index == 0:
            inserted = 'synonyms'
        else:
            inserted = ''
        leading, trailing = row[:2], row[2:]
        return leading + [inserted] + trailing


if __name__ == '__main__':
    # Script entry point: apply the Visitor to every row of taxa.csv.
    visitor = Visitor()
    visit('taxa.csv', visitor)
示例#3
0
def select(p):
    """Stage the file *p* from the ``cn`` data directory and filter the copy.

    The file is copied to ``cn/staged_images.csv``, a ``Selector`` is run
    over the copy via ``visit``, and the number of line breaks in the
    resulting file is printed.

    :param p: file name inside the ``cn`` data directory.
    """
    staged = data_file('cn', 'staged_images.csv')
    shutil.copy(data_file('cn', p), staged)
    visit('cn/staged_images.csv', Selector())
    # Read inside a context manager so the handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(staged) as fp:
        content = fp.read()
    print(len(content.split('\n')) - 1)
示例#4
0
def rewrite(p):
    """Visit the file *p* in the ``cn`` directory with a ``JSON2CSV`` visitor.

    The visitor is constructed from the path of ``cn/images.json``.

    :param p: file name, appended to the ``'cn/'`` prefix.
    """
    target = 'cn/' + p
    json_path = data_file('cn', 'images.json')
    converter = JSON2CSV(json_path)
    visit(target, converter)
示例#5
0
def do_check(fname):
    """Visit *fname* with a ``RemoveUploaded`` visitor.

    The visitor receives a dict of the ``images.csv`` items whose
    ``source_url`` contains ``'edmond'``, keyed by ``(taxa__id, tags)``.
    When several items share a key, the last one wins.

    :param fname: name of the file handed to ``visit``.
    """
    existing = {}
    for item in csv_items('images.csv'):
        if 'edmond' in item['source_url']:
            existing[(item['taxa__id'], item['tags'])] = item
    visit(fname, RemoveUploaded(existing))
示例#6
0
def rewrite(p):
    """Run ``visit`` on ``'cn/' + p`` using a ``JSON2CSV`` visitor.

    :param p: file name below the ``cn`` directory; the visitor itself is
        built from the path returned by ``data_file('cn', 'images.json')``.
    """
    csv_name = 'cn/' + p
    visitor = JSON2CSV(data_file('cn', 'images.json'))
    visit(csv_name, visitor)
示例#7
0
def select(p):
    """Copy *p* to ``cn/staged_images.csv``, filter it and report its size.

    After copying, a ``Selector`` is applied to the staged file through
    ``visit``; the number of line breaks left in the file is then printed.

    :param p: name of the source file inside the ``cn`` data directory.
    """
    staged_path = data_file('cn', 'staged_images.csv')
    shutil.copy(data_file('cn', p), staged_path)
    visit('cn/staged_images.csv', Selector())
    # The original left the file object unclosed; ``with`` releases the
    # handle as soon as the content has been read.
    with open(staged_path) as staged_file:
        text = staged_file.read()
    print(len(text.split('\n')) - 1)
示例#8
0
def do_check(fname):
    """Apply ``RemoveUploaded`` to *fname*.

    ``RemoveUploaded`` is initialised with a mapping from
    ``(taxa__id, tags)`` to the matching ``images.csv`` item, restricted to
    items whose ``source_url`` contains the substring ``'edmond'``.

    :param fname: name of the file passed on to ``visit``.
    """
    edmond_items = (
        item for item in csv_items('images.csv')
        if 'edmond' in item['source_url']
    )
    existing = dict(
        ((item['taxa__id'], item['tags']), item) for item in edmond_items
    )
    visit(fname, RemoveUploaded(existing))
示例#9
0
def dedup():
    """Deduplicate ``cn/images.csv`` and print the deduplicator's count.

    The ``Deduplicator`` is created from the list of ``id`` values of those
    ``images.csv`` items whose ``source_url`` contains ``'edmond'``.
    """
    edmond_ids = list(
        item['id']
        for item in csv_items('images.csv')
        if 'edmond' in item['source_url']
    )
    dedup_visitor = Deduplicator(edmond_ids)
    visit('cn/images.csv', dedup_visitor)
    print(dedup_visitor.count)
示例#10
0

class Visitor(object):
    """Row callback that fills the ``source`` column from image metadata.

    The metadata is loaded from ``images_md.json`` on construction. For each
    data row the path segments of the row's ``src`` URL are looked up in the
    metadata; the first segment that has a non-empty ``source_url`` entry
    supplies the new ``source`` value. ``count`` records how many rows were
    updated.
    """

    def __init__(self):
        self.count = 0
        self.cols = {}
        with open(data_file('images_md.json'), 'rb') as fp:
            self.md = json.load(fp)

    def __call__(self, index, row):
        """Handle one row and return it (modified in place where applicable).

        :param index: position of the row; ``0`` denotes the header row.
        :param row: list of cell values.
        :return: the same *row* object.
        """
        if index == 0:
            # Header row: remember the position of every column name.
            self.cols = dict((name, pos) for pos, name in enumerate(row))
            return row

        url = URL(row[self.cols['src']])
        try:
            for segment in url.path_segments():
                if segment not in self.md:
                    continue
                if not self.md[segment].get('source_url'):
                    continue
                # First segment with a usable source_url wins.
                row[self.cols['source']] = self.md[segment]['source_url']
                self.count += 1
                break
        except IndexError:
            # An IndexError raised while scanning the segments leaves the
            # row as it is.
            pass
        return row


if __name__ == '__main__':
    # Script entry point: run the visitor over images.csv, then report how
    # many rows had their source column updated.
    visitor = Visitor()
    visit('images.csv', visitor)
    print(visitor.count)
示例#11
0
        self.edmond_urls = file_urls(data_file('Edmond.xml'))
        self.cols = {}
        self.count = 0

    def __call__(self, index, row):
        if index == 0:
            self.cols = {col: i for i, col in enumerate(row)}
            return row

        _id = row[self.cols['id']]

        if _id in self.edmond_urls:
            row[self.cols['source_url']] = self.edmond_urls[_id]['full']
            self.count += 1
        else:
            #
            # FIXME: check whether source_url is an Edmond image URL, if not, upload the
            # image to Edmond, insert the URL here! Depends on the imeji API being
            # available on Edmond.
            #
            print(_id, row)
        return row


if __name__ == '__main__':
    # Download before opening the target file: opening with 'w' truncates
    # Edmond.xml, so a failed request must not leave an empty file behind.
    response = requests.get(URL)
    # Refuse to store an HTTP error page as if it were the XML export.
    response.raise_for_status()
    with open(data_file('Edmond.xml'), 'w', encoding='utf8') as fp:
        fp.write(response.text)
    # The Visitor is created only after the fresh Edmond.xml is on disk.
    v = Visitor()
    # The file to visit may be given as first command line argument.
    visit(sys.argv[1] if len(sys.argv) > 1 else 'images.csv', v)
    print(v.count)
示例#12
0
    Corresponding items in the Tsammalex collection on Edmond are detected by matching
    the id of the image against filename or checksum attribute of the Edmond item.
    """
    def __init__(self):
        """Load the Edmond URL lookup and start with no known columns.

        ``edmond_urls`` is whatever ``file_urls`` returns for the
        ``Edmond.xml`` data file; ``cols`` is filled in once the header row
        has been seen.
        """
        edmond_xml = data_file('Edmond.xml')
        self.edmond_urls = file_urls(edmond_xml)
        self.cols = {}

    def __call__(self, index, row):
        if index == 0:
            self.cols = {col: i for i, col in enumerate(row)}
            return row

        _id = row[self.cols['id']]

        if _id in self.edmond_urls:
            row[self.cols['source_url']] = self.edmond_urls[_id]['full']
        else:
            #
            # FIXME: check whether source_url is an Edmond image URL, if not, upload the
            # image to Edmond, insert the URL here! Depends on the imeji API being
            # available on Edmond.
            #
            print(_id, row)
        return row


if __name__ == '__main__':
    # Fetch the export first: opening Edmond.xml with 'w' truncates it, so
    # the file is only touched once the download has succeeded.
    response = requests.get(URL)
    # An HTTP error response must not be written out as the XML export.
    response.raise_for_status()
    with open(data_file('Edmond.xml'), 'w', encoding='utf8') as fp:
        fp.write(response.text)
    # The file to visit may be given as first command line argument; the
    # Visitor is created after the fresh Edmond.xml has been written.
    visit(sys.argv[1] if len(sys.argv) > 1 else 'images.csv', Visitor())