Example No. 1
# encoding=utf-8
import jieba
import csv
import re

f = open('D:\\Shared\\Rawdata\\Split\\test\\splitBD.txt', 'a', encoding='UTF-8')
splitcsv = []
jieba.set_dictionary('dict.txt.big')
jieba.load_userdict('dictnew.txt')
with open('D:\\Shared\\Rawdata\\Raw\\BD.csv', encoding='UTF-8') as rawcsv:
    readCSV = csv.DictReader(rawcsv, delimiter=',')
    for row in readCSV:
        splitpost = []
        temp = row['article']
        rr = "[\s+\.\!\/_,$%^*(+\"\'\-]+|[+——!,。?、~@#¥%……&*()「」《》?:·).〈〉:/;◆■◇×=|°│─;“”\[\]→↓Nㄧˋ%\}\{\>\<’`÷‘±↑╱『˙<≠┤‘§€↑╱★ˇ←≧┐└‧+ˊ』>-~\ –ㄟ*※【】,、。.}{()╴—–|·‥…!?:;‧〔〕【】《》〈〉「」『』‘’“”☆◎▲△●○〃§※*&#′‵〞〝★◇◆□■▽▼㊣ˍ﹉﹊﹍﹎﹋﹌♀∕\/∣∥↘↙↗↖→←↓↑⊙⊕♂℅ ̄_+-×÷±√<>=≦≧≠∞$∴∵∮∫㏑㏒⊿∟∠⊥∪∩~≡≒¥〒¢£%€℃℉㏕㎝㎜㎞㏎㎡㎎㎏㏄°▁▂▃▄▅▆▆▇█▏▎▍▌▋▊▉┼┴┴┬┤├▔─│┌▕┐└┘╩╚╣╬╠╗╦╔╡╪╞═╯╰╮╭╝╒╤╕╘╧╛╓╥╖╟╫╢╙╨╜║▓◢◣◥◤╱╲╳˙ˉˊˇˋㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄠㄟㄞㄝㄜㄛㄚㄙㄘㄗㄖㄕㄔㄓㄒㄑㄡㄢㄣㄤㄥㄦㄧㄨㄩ1&423567890qazxswedcvfrtgbnhyujmkiolp︱QAZXSWEDCVFRTGBNHYUJMKILOPⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ@︰﹪﹣]+"
        article = re.sub(rr, "", temp)
        words = jieba.cut(article, cut_all=False, HMM=True)
        for word in words:
            f.write(word)
            f.write("\n")
f.close()
rawcsv.close()
Example No. 2
acom_list = []
for ad in acom_diagrams:
    acom_list.append(str(ad))
    all_architectures_list.append(ad)

all_architectures_list = sorted(all_architectures_list, key=lambda x: x.name)

parsed_articles = []

all_architectures_list = unique(all_architectures_list, cleanup=False)

root = path.dirname(path.abspath(__file__))
popularity = []
with open(path.join(root, "popularity.csv"), newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        popularity.append(dict(line))

for file in all_architectures_list:
    article = {}
    pricing = False
    deploy = False
    sample_code = False

    file_path = Path(file)
    str_path = str(file_path)

    # Skip index pages
    # print(str_path)
    if is_excluded(file_path):
        plt.title("Speed distribution for vessel " + str(mmsi))
        plt.xlabel("Speed, Knots")
        plt.ylabel("Frequency")
        fig = plt.gcf()
        fig_size = plt.rcParams["figure.figsize"]

        # Default figure size is [8.0, 6.0]
        # Set figure width to 9 and height to 2.5
        fig_size[0] = 9
        fig_size[1] = 2.5
        plt.rcParams["figure.figsize"] = fig_size
        plt.gcf().subplots_adjust(bottom=0.20)

        fig.set_size_inches(fig_size[0], fig_size[1])
        fig.savefig(outdir + str(mmsi) + '.png', dpi=100, bbox_inches='tight')
        print "succeed, ", mmsi
        plt.clf()
    else:
        print "not enough data for ", mmsi

    # plt.show()


with open(sourcedir + filename, 'r', newline='') as f:
    reader = csv.DictReader(f, delimiter=',')
    for row in reader:
        mmsi = row['mmsi']
        #   months_2014 = row['months_2014'].split(",")
        months_2015 = row['months_2015'].split(",")
        make_histogram(mmsi, months_2015)
Example No. 4
    def read_schedules(self):
        """
        Go through each student schedule and compile each course and its conflict as well as the total course
        list and the student object list
        :return: tuple containing: list of students, list of all courses, dictionary of courses and conflicts
        (and empty colors)
        """
        with open(self.schedules_csv, newline='') as fp:
            reader = csv.DictReader(fp)
            students = []
            courses = []
            conflicts = {}

            for row in reader:
                # get the course numbers for the current student
                student_course_nums = []
                for i in range(1, 7):
                    course_num = row['Course {}'.format(str(i))]
                    if course_num is not None:
                        student_course_nums.append(course_num)

                # loop through each course that we found
                for student_course_num in student_course_nums:
                    # disregard blank courses (not a full schedule)
                    if student_course_num is not None:
                        # add it to the full course list
                        if student_course_num not in courses:
                            courses.append(student_course_num)

                        if student_course_num not in conflicts:
                            # Conflicts should be all of the courses that are in the list besides the current course,
                            # as it is the first entry
                            conflicts[student_course_num] = [
                                x for i, x in enumerate(student_course_nums)
                                if i != student_course_nums.index(
                                    student_course_num)
                            ]
                        else:
                            course_conflicts = conflicts[student_course_num]
                            # Add conflicts to the existing course
                            # Should be all courses besides the current one and that are not already in the list
                            new_conflicts = [
                                x for i, x in enumerate(student_course_nums)
                                if i != student_course_nums.index(
                                    student_course_num)
                                and x not in course_conflicts
                            ]
                            # add the new conflicts we found to the list
                            if len(new_conflicts) > 0:
                                for i in range(len(new_conflicts)):
                                    course_conflicts.append(new_conflicts[i])
                            # put it back in the dictionary
                            conflicts[student_course_num] = course_conflicts
                # add a student object to the list
                students.append(
                    Student(row['Lastname'], row['Firstname'],
                            student_course_nums))

        # set all the course colors (second item in the tuple) to None for now, handled later
        for course in conflicts:
            conflicts[course] = (conflicts[course], None)

        return students, courses, conflicts
Example No. 5
 def start_requests(self):
     with open('newsUrlCrawl.csv') as csvfile:
         reader = csv.DictReader(csvfile)
         for row in reader:
             yield scrapy.Request(row['url'], self.parse_news)
Example No. 6
import csv
from numpy import *
from pymc import *

household_data = [d for d in csv.DictReader(open('household.csv'))]
county_data = [d for d in csv.DictReader(open('county.csv'))]

# hyper-priors
g = Uniform('gamma', [-100, -100], [100, 100])

s_a = Uniform('sigma_a', 0, 100)

# priors
a = {}
for d in county_data:

    @stochastic(name='a_%s' % d['county'])
    def a_j(value=0., g=g, u_j=float(d['u']), s_a=s_a):
        return normal_like(value, g[0] + g[1] * u_j, s_a**-2.)

    a[d['county']] = a_j

b = Uniform('beta', -100, 100)

s_y = Uniform('sigma_y', 0, 100)

# likelihood
y = {}
for d in household_data:

    @stochastic(observed=True, name='y_%s' % d['household'])
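    # (The body of the observed stochastic is cut off in this excerpt. A minimal sketch of
    # how it could continue, mirroring the county-level prior above; the 'x', 'y' and
    # household-level 'county' column names are assumptions, not taken from the original.)
    def y_i(value=float(d['y']), a_j=a[d['county']], b=b,
            x_i=float(d['x']), s_y=s_y):
        return normal_like(value, a_j + b * x_i, s_y**-2.)

    y[d['household']] = y_i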
Example No. 7
def generate_images_annotations_json(main_dir,
                                     metadata_dir,
                                     subset,
                                     cls_index,
                                     version='v4'):
    validation_image_ids = {}

    if version == 'v4':
        annotations_path = os.path.join(
            metadata_dir, subset, '{}-annotations-bbox.CSV'.format(subset))
    elif version == 'challenge2018':
        validation_image_ids_path = os.path.join(
            metadata_dir, 'challenge-2018-image-ids-valset-od.CSV')

        with open(validation_image_ids_path, 'r') as csv_file:
            reader = csv.DictReader(csv_file, fieldnames=['ImageID'])
            next(reader)
            for line, row in enumerate(reader):
                image_id = row['ImageID']
                validation_image_ids[image_id] = True

        annotations_path = os.path.join(
            metadata_dir, 'challenge-2018-train-annotations-bbox.CSV')
    else:
        annotations_path = os.path.join(metadata_dir, subset,
                                        'annotations-human-bbox.CSV')

    fieldnames = [
        'ImageID', 'Source', 'LabelName', 'Confidence', 'XMin', 'XMax', 'YMin',
        'YMax', 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction',
        'IsInside'
    ]

    id_annotations = dict()
    with open(annotations_path, 'r') as csv_file:
        reader = csv.DictReader(csv_file, fieldnames=fieldnames)
        next(reader)

        images_sizes = {}
        for line, row in enumerate(reader):
            frame = row['ImageID']

            if version == 'challenge2018':
                if subset == 'train':
                    if frame in validation_image_ids:
                        continue
                elif subset == 'validation':
                    if frame not in validation_image_ids:
                        continue
                else:
                    raise NotImplementedError(
                        'This generator handles only the train and validation subsets'
                    )

            class_name = row['LabelName']

            if class_name not in cls_index:
                continue

            cls_id = cls_index[class_name]

            if version == 'challenge2018':
                # We recommend participants to use the provided subset of the training set as a validation set.
                # This is preferable over using the V4 val/test sets, as the training set is more densely annotated.
                img_path = os.path.join(main_dir, 'images', 'train',
                                        frame + '.jpg')
            else:
                img_path = os.path.join(main_dir, 'images', subset,
                                        frame + '.jpg')

            if frame in images_sizes:
                width, height = images_sizes[frame]
            else:
                try:
                    with Image.open(img_path) as img:
                        width, height = img.width, img.height
                        images_sizes[frame] = (width, height)
                except Exception as ex:
                    if version == 'challenge2018':
                        raise ex
                    continue

            x1 = float(row['XMin'])
            x2 = float(row['XMax'])
            y1 = float(row['YMin'])
            y2 = float(row['YMax'])

            x1_int = int(round(x1 * width))
            x2_int = int(round(x2 * width))
            y1_int = int(round(y1 * height))
            y2_int = int(round(y2 * height))

            # Check that the bounding box is valid.
            if x2 <= x1:
                raise ValueError(
                    'line {}: x2 ({}) must be higher than x1 ({})'.format(
                        line, x2, x1))
            if y2 <= y1:
                raise ValueError(
                    'line {}: y2 ({}) must be higher than y1 ({})'.format(
                        line, y2, y1))

            if y2_int == y1_int:
                warnings.warn(
                    'filtering line {}: rounding y2 ({}) and y1 ({}) makes them equal'
                    .format(line, y2, y1))
                continue

            if x2_int == x1_int:
                warnings.warn(
                    'filtering line {}: rounding x2 ({}) and x1 ({}) makes them equal'
                    .format(line, x2, x1))
                continue

            img_id = row['ImageID']
            annotation = {
                'cls_id': cls_id,
                'x1': x1,
                'x2': x2,
                'y1': y1,
                'y2': y2
            }

            if img_id in id_annotations:
                annotations = id_annotations[img_id]
                annotations['boxes'].append(annotation)
            else:
                id_annotations[img_id] = {
                    'w': width,
                    'h': height,
                    'boxes': [annotation]
                }
    return id_annotations
def df_input_schema(_context, path):
    with open(path, 'r') as fd:
        return [
            OrderedDict(sorted(x.items(), key=lambda x: x[0]))
            for x in csv.DictReader(fd)
        ]
Example No. 9
def cvs_upload(request):
    if request.method == "POST":
        form = CVSUploadForm(request.POST, request.FILES)
        if form.is_valid():
            form.save()
            CSV_PATH = settings.MEDIA_ROOT + str(CVSUpload.objects.last().file)
            with open(CSV_PATH, encoding='utf-8-sig') as csvfile:
                data_reader = csv.DictReader(csvfile)
                for row in data_reader:
                    print(row['calling_plan'], '있음')
                    try:
                        plan = CallingPlan.objects.get(
                            calling_plan=row['calling_plan'],
                            mobile_carrier=row['mobile_carrier'])
                        if plan:
                            plan.company = row['company']
                            plan.brand = row['brand']
                            plan.homepage = row['homepage']
                            plan.calling_plan = row['calling_plan']
                            plan.mobile_carrier = row['mobile_carrier']
                            plan.category = row['category']
                            plan.data_speed = row['data_speed']
                            plan.data_category = row['data_category']
                            plan.data_unit = row['data_unit']
                            plan.call = row['call']
                            plan.call_unit = row['call_unit']
                            plan.unlimited_free = row['unlimited_free']
                            plan.message = row['message']
                            plan.message_unit = row['message_unit']
                            plan.data1 = row['data1']
                            plan.data2 = row['data2']
                            plan.pay = row['pay']
                            plan.promo_pay = row['promo_pay']
                            # calling_plan.saled_pay1 = row['saled_pay1']
                            # calling_plan.saled_pay2 = row['saled_pay2']
                            # calling_plan.saled_pay3 = row['saled_pay3']
                            # calling_plan.sales_pay1 = row['sales_pay1']
                            # calling_plan.condition1 = row['condition1']
                            # calling_plan.sales_pay2 = row['sales_pay2']
                            # calling_plan.condition2 = row['condition2']
                            # calling_plan.sales_pay3 = row['sales_pay3']
                            # calling_plan.condition3 = row['condition3']
                            # calling_plan.etc1 = row['etc1']
                            # calling_plan.etc2 = row['etc2']
                            # calling_plan.etc3 = row['etc3']
                            plan.activation = row['activation']
                            plan.update_date = timezone.localtime()
                            plan.save()
                    except ObjectDoesNotExist:
                        print(row['calling_plan'], '추가')
                        data = CallingPlan.objects.create(
                            company=row['company'],
                            brand=row['brand'],
                            homepage=row['homepage'],
                            calling_plan=row['calling_plan'],
                            mobile_carrier=row['mobile_carrier'],
                            category=row['category'],
                            data_speed=row['data_speed'],
                            data_category=row['data_category'],
                            data_unit=row['data_unit'],
                            call=row['call'],
                            call_unit=row['call_unit'],
                            unlimited_free=row['unlimited_free'],
                            message=row['message'],
                            message_unit=row['message_unit'],
                            data1=row['data1'],
                            data2=row['data2'],
                            pay=row['pay'],
                            promo_pay=row['promo_pay'],
                            # saled_pay1 = row['saled_pay1'],
                            # saled_pay2 = row['saled_pay2'],
                            # saled_pay3 = row['saled_pay3'],
                            # sales_pay1 = row['sales_pay1'],
                            # condition1 = row['condition1'],
                            # sales_pay2 = row['sales_pay2'],
                            # condition2 = row['condition2'],
                            # sales_pay3 = row['sales_pay3'],
                            # condition3 = row['condition3'],
                            # etc1 = row['etc1'],
                            # etc2 = row['etc2'],
                            # etc3 = row['etc3'],
                            activation=row['activation'],
                            update_date='',
                            create_date=timezone.localtime())
            return render(request, 'epost/cvs_uploaded.html')
    else:
        form = CVSUploadForm()
    return render(request, 'epost/cvs_upload.html', {'form': form})
Example No. 10
Translation.query.delete()
TranslationExample.query.delete()
Chapter.query.delete()
Book.query.delete()
GrammaticalTerm.query.delete()
Grammar.query.delete()
GrammarExample.query.delete()

v_count = 0
t_count = 0
te_count = 0
g_count = 0
ge_count = 0

with open('csv/main.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file, delimiter=',')
    for row in csv_reader:
        v_id = row['vocabulary.id']
        v_hanzi = row['vocabulary.hanzi']
        v_chapter_id = row['vocabulary.chapter_id']
        v_number_in_chapter = row['vocabulary.number_in_chapter']
        if v_id and v_hanzi and v_chapter_id and v_number_in_chapter:
            v_count += 1
            v = Vocabulary(id=v_id,
                           hanzi=v_hanzi,
                           chapter_id=v_chapter_id,
                           number_in_chapter=v_number_in_chapter)
            db.session.add(v)
        t_id = row['translation.id']
        t_vocabulary_id = row['translation.vocabulary_id']
        t_translation_en = row['translation.translation_en']
Example No. 11
def loadparamsfromcsv(csvfilename, runs):
    """ Load and parse the csv file for the given set of runs and
    return nested dictionary: a collection of dictionaries, one for
    each csv row matching a run number.

    """
    import csv
    import os.path
    from sys import exit # use sys.exit instead of built-in exit (latter raises exception)

    class CommentedFile:
        """ Decorator for text files: filters out comments (i.e. first char of line #)
        Based on http://www.mfasold.net/blog/2010/02/python-recipe-read-csvtsv-textfiles-and-ignore-comment-lines/
        
        """
        def __init__(self, f, commentstring="#"):
            self.f = f
            self.commentstring = commentstring
            self.linecount = 0
        def rewind(self):
            self.f.seek(0)
            self.linecount = 0
        def next(self):
            line = self.f.next()
            self.linecount += 1
            while line.startswith(self.commentstring) or not line.strip(): # test if line commented or empty
                line = self.f.next()
                self.linecount += 1
            return line
        def __iter__(self):
            return self

    log = logging.getLogger('jobsub')
    parameters_csv = {} # store all information needed from the csv file
    if csvfilename is None: 
        return parameters_csv # if no file name given, return empty collection here
    if not os.path.isfile(csvfilename): # check if file exists
        log.error("Could not find the specified csv file '"+csvfilename+"'!")
        exit(1)
    try:
        log.debug("Opening csv file '"+csvfilename+"'.")
        csvfile = open(csvfilename, 'rb')
        filteredfile = CommentedFile(csvfile)
        try:
            # construct a sample for the csv format sniffer:
            sample = ""
            try:
                while (len(sample)<1024):
                    sample += filteredfile.next()
            except StopIteration:
                log.debug("End of csv file reached, sample limited to " + str(len(sample))+ " bytes")
            dialect = csv.Sniffer().sniff(sample) # test csv file format details
            log.debug("Determined the CSV dialect as follows: delimiter=%s, doublequote=%s, escapechar=%s, lineterminator=%s, quotechar=%s , quoting=%s, skipinitialspace=%s", dialect.delimiter, dialect.doublequote, dialect.escapechar, list(ord(c) for c in dialect.lineterminator), dialect.quotechar, dialect.quoting, dialect.skipinitialspace)
            filteredfile.rewind() # back to beginning of file
            reader = csv.DictReader(filteredfile, dialect=dialect) # now process CSV file contents here and load them into memory
            reader.next() # python < 2.6 requires an actual read access before filling 'DictReader.fieldnames'
            log.debug("CSV file contains the header info: %s", reader.fieldnames)
            try:
                reader.fieldnames = [field.lower() for field in reader.fieldnames] # convert to lower case keys to avoid confusion
                reader.fieldnames = [field.strip() for field in reader.fieldnames] # remove leading and trailing white space
            except TypeError:
                log.error("Could not process the CSV file header information. csv.DictReader returned fieldnames: %s", reader.fieldnames)
                exit(1)
            if not "runnumber" in reader.fieldnames: # verify that we have a column "runnumber"
                log.error("Could not find a column with header label 'RunNumber' in file '"+csvfilename+"'!")
                exit(1)
            if "" in reader.fieldnames:
                log.warning("Column without header label encountered in csv file '"+csvfilename+"'!")
            log.info("Successfully loaded csv file'"+csvfilename+"'.")
            # first: search through csv file to find corresponding runnumber entry line for every run
            filteredfile.rewind() # back to beginning of file
            reader.next()   # .. and skip the header line
            missingRuns = list(runs) # list of runs to look for in csv file
            for row in reader: # loop over all rows once
                try:
                    for run in missingRuns: # check all runs if runnumber matches
                        if int(row["runnumber"]) == run:
                            log.debug("Found entry in csv file for run "+str(run)+" on line "+ str(filteredfile.linecount))
                            parameters_csv[run] = {}
                            parameters_csv[run].update(row)
                            missingRuns.remove(run)
                            break
                except ValueError: # int conversion error
                    log.warn("Could not interpret run number on line "+str(filteredfile.linecount)+" in file '"+csvfilename+"'.")
                    continue
                if len(missingRuns)==0:
                    log.debug("Finished search for runs in csv file before reaching end of file")
                    break
            log.debug("Searched over "+str(filteredfile.linecount)+" lines in file '"+csvfilename+"'.")
            if not len(missingRuns)==0:
                log.error("Could not find an entry for the following run numbers in '"+csvfilename+"': "+', '.join(map(str, missingRuns)))
        finally:
            csvfile.close()
    except csv.Error as e:
        log.error("Problem loading the csv file '%s': %s" % (csvfilename, e))
        exit(1)
Example No. 12
def flatten_class(out_loc, zoo_file):
    with open(out_loc, 'w', newline='', encoding='utf-8') as file:
        fieldnames = [
            'subject_ids', 'filename', 'user_name', 'workflow_id',
            'workflow_version', 'classification_id', 'created_at',
            'fluke_bounding_boxes', 'fluke_tip_points', 'fluke_notch_points'
        ]
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        # this area for initializing counters:
        i = 0
        j = 0
        with open(zoo_file, 'r', encoding='utf-8') as csvfile:
            csvreader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
            for row in csvreader:
                i += 1
                # # useful for debugging - set the number of records to process to a low number ~1000
                # if i == 1000:
                #     break
                if include(row) is True:
                    j += 1
                    anns = json.loads(row['annotations'])
                    subj = json.loads(row['subject_data'])

                    # recover the subject filename from the subject-data
                    filename = ''
                    for k in subj:
                        if "filename" in subj[k]:
                            filename = subj[k]['filename']
                        elif "Filename" in subj[k]:
                            filename = subj[k]['Filename']
                        else:
                            print("No filename found")
                            print(subj)
                    filename = filename.lower()

                    fluke_bounding_boxes = []
                    fluke_tip_points = []
                    fluke_notch_points = []
                    for ann in anns:
                        try:
                            # pull out boxes
                            if ann['task'] == 'T1':
                                for drawing_object in ann['value']:
                                    if pull_rectangle(drawing_object):
                                        fluke_bounding_boxes.append(
                                            pull_rectangle(drawing_object))
                            # pull out tip points
                            if ann['task'] == 'T2':
                                for drawing_object in ann['value']:
                                    if pull_point(drawing_object):
                                        fluke_tip_points.append(
                                            pull_point(drawing_object))
                            # pull out notch points
                            if ann['task'] == 'T3':
                                for drawing_object in ann['value']:
                                    if pull_point(drawing_object):
                                        fluke_notch_points.append(
                                            pull_point(drawing_object))
                        except KeyError:
                            continue

                    writer.writerow({
                        'subject_ids':
                        row['subject_ids'],
                        'filename':
                        filename,
                        'user_name':
                        row['user_name'],
                        'workflow_id':
                        row['workflow_id'],
                        'workflow_version':
                        row['workflow_version'],
                        'classification_id':
                        row['classification_id'],
                        'created_at':
                        row['created_at'],
                        'fluke_bounding_boxes':
                        json.dumps(fluke_bounding_boxes),
                        'fluke_tip_points':
                        json.dumps(fluke_tip_points),
                        'fluke_notch_points':
                        json.dumps(fluke_notch_points)
                    })
                if i % 10000 == 0:
                    print('flatten', i, j)
    return str(i) + ' Lines read and ' + str(j) + ' records processed'
Example No. 13
def aggregate(sorted_loc, aggregated_loc):
    with open(aggregated_loc, 'w', newline='', encoding='utf-8') as file:
        fieldnames = [
            'subject_ids', 'filename', 'classifications', 'boxes',
            'box_clusters', 'bclusters', 'tips', 'tip_clusters', 'tclusters',
            'notches', 'notch_clusters', 'nclusters', 'flukes'
        ]
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        # set up to read the flattened file
        with open(sorted_loc, 'r', encoding='utf-8') as f:
            r = csv.DictReader(f)
            # initialize a starting point subject and empty bins for aggregation
            subject = ''
            users = ''
            filename = ''
            i = 1
            j = 0
            boxes = []
            tips = []
            notches = []

            # Loop over the flattened classification records
            for row in r:
                j += 1
                if j % 10000 == 0:
                    print('aggregating', j)
                # read a row and pull out the flattened data fields we need to aggregate or output
                new_subject = row['subject_ids']
                new_filename = row['filename']
                new_user = row['user_name']
                row_boxes = json.loads(row['fluke_bounding_boxes'])
                row_tips = json.loads(row['fluke_tip_points'])
                row_notches = json.loads(row['fluke_notch_points'])

                # test for change in selector - output on change
                if new_subject != subject:
                    if i != 1:  # if not the first line analyse the aggregated fields and output the results
                        new_row = process_aggregation(subject, filename, i,
                                                      boxes, tips, notches)
                        writer.writerow(new_row)

                    # reset the selector, those things we need to output and the bins for the aggregation.
                    i = 1
                    subject = new_subject
                    filename = new_filename
                    users = {new_user}
                    boxes = row_boxes
                    tips = row_tips
                    notches = row_notches

                else:
                    # do the aggregation - clean for excess classifications and multiple classifications by the same
                    # user on this subject
                    if users != users | {new_user}:
                        users |= {new_user}
                        boxes.extend(row_boxes)
                        tips.extend(row_tips)
                        notches.extend(row_notches)
                        i += 1

            # catch and process the last aggregated group
        new_row = process_aggregation(subject, filename, i, boxes, tips,
                                      notches)
        writer.writerow(new_row)
Example No. 14
                              '\n')
                if retry.lower() != 'y':
                    quit()
    else:
        flatten_class(out_location, zooniverse_file)
        sort_file(out_location, sorted_location, 0, False, True)
        aggregate(sorted_location, aggregated_location)

    # crawl the image directory and acquire the filenames
    imageFilenames, imageFilenameMap = get_filenames(fluke_images_dir)

    # load the aggregated WAI data and proceed to loop over the valid flukes. Load the matching image if any and
    # rotate and crop the image and save the cropped image.
    with open(aggregated_location, 'r', encoding='utf-8') as ag_file:
        images_not_processed = []
        r_ag = csv.DictReader(ag_file)
        for line in r_ag:
            fluke_positons = json.loads(line['flukes'])
            image = line['filename']
            if image not in imageFilenames:
                continue
            else:
                # a match has been found with one of the current images being analysed.
                realFilename = imageFilenameMap[image]
                # Read the image
                imageData = cv.imread(fluke_images_dir + os.sep + realFilename)
                width, height = imageData.shape[1], imageData.shape[0]
                counter = 0
                if len(fluke_positons) < 5:  # the invalid "something weird" fluke positions fail this test
Example No. 15
	def load(self,csv_file_name):

		raw_XX = [] # 3D list (2nd dim is mutable)
		raw_Y = []  # 2D list (2nd dim is mutable)
		raw_AA = []
		raw_MM = []

		with open(csv_file_name) as csv_file:
			reader = csv.DictReader(csv_file,delimiter=';')
			past_name = None
			X = []
			y = []
			A = []
			M = []
			for row in reader:
				# Each row corresponds to a frame (bar)
				# Using 'filename_sv' to determine song boundaries
				if past_name != row['filename_sv']:
					if X:
						raw_XX.append(X)
					if y:
						raw_Y.append(y)
					if A:
						raw_AA.append(A)
					if M:
						raw_MM.append(M)
					X = []
					y = []
					A = []
					M = []

				past_name = row['filename_sv']

				# Get rid of songs with no key
				if not row['key']:
					continue

				# Note: mode not currently used
				key, mode = self._process_key(row['key'])
				self.keys.append(key)
				X_i = self._process_Xi(row['tpc_hist_counts'])
				y_i = self._process_yi(row['chords_raw'],row['chord_types_raw'],key)
				A_i = self._process_Ai(row['tpc_raw'])
				M_i = self._process_Mi(row['metrical_weight'])

				# get rid of bars with no chords
				if not y_i:
					continue

				X.append(X_i)
				y.append(y_i)
				A.append(A_i)
				M.append(M_i)

			if X:
				raw_XX.append(X)
			if y:
				raw_Y.append(y)
			if A:
				raw_AA.append(A)
			if M:
				raw_MM.append(M)

		self.XX = self._process_XX(raw_XX)
		self.Y = self._process_Y(raw_Y)
		self.AA = self._process_AA(raw_AA)
		self.MM = self._process_MM(raw_MM)
Example No. 16
ontologyMap = {}
with open(args.ontology, 'rt') as fin:
  for line in fin:
    if line.startswith("[Term]"):
      clId = ""
      clName = ""
    elif line.startswith("id: CL:"):
      clId = line[7:].rstrip()
    elif line.startswith("name: ") and clId != "":
      clName = line[6:].rstrip()
      ontologyMap["CL_" + clId] = clName
      
ontologyGeneMap = {}
with open(args.input) as tsvfile:
  reader = csv.DictReader(tsvfile, dialect='excel-tab')
  with open(args.output, 'wt') as fout:
    for row in reader:
      olo = row["CellOntologyID"]
      
      if olo == "NA":
        continue
      
      if olo not in ontologyMap:
        print("Cannot find ontology " + olo)
        continue
      
      genes = row["geneSymbol"]
      
      if not olo in ontologyGeneMap:
        ontologyGeneMap[olo] = {}
Example No. 17
def read_data(data_file, types_file, miss_file, true_miss_file):
    
    #Read types of data from data file
    with open(types_file) as f:
        types_dict = [{k: v for k, v in row.items()}
        for row in csv.DictReader(f, skipinitialspace=True)]
    
    #Read data from input file
    with open(data_file, 'r') as f:
        data = [[float(x) for x in rec] for rec in csv.reader(f, delimiter=',')]
        data = np.array(data)
    
    #Substitute NaN values by something (we assume we have the real missing value mask)
    if true_miss_file:
        with open(true_miss_file, 'r') as f:
            missing_positions = [[int(x) for x in rec] for rec in csv.reader(f, delimiter=',')]
            missing_positions = np.array(missing_positions)
            
        true_miss_mask = np.ones([np.shape(data)[0],len(types_dict)])
        true_miss_mask[missing_positions[:,0]-1,missing_positions[:,1]-1] = 0 #Indexes in the csv start at 1
        data_masked = np.ma.masked_where(np.isnan(data),data) 
        #We need to fill the data depending on the given data...
        data_filler = []
        for i in range(len(types_dict)):
            if types_dict[i]['type'] == 'cat' or types_dict[i]['type'] == 'ordinal':
                aux = np.unique(data[:,i])
                data_filler.append(aux[0])  #Fill with the first element of the cat (0, 1, or whatever)
            else:
                data_filler.append(0.0)
            
        data = data_masked.filled(data_filler)
    else:
        true_miss_mask = np.ones([np.shape(data)[0],len(types_dict)]) #It doesn't affect our data

    #Construct the data matrices
    data_complete = []
    for i in range(np.shape(data)[1]):
        
        if types_dict[i]['type'] == 'cat':
            #Get categories
            cat_data = [int(x) for x in data[:,i]]
            categories, indexes = np.unique(cat_data,return_inverse=True)
            #Transform categories to a vector of 0:n_categories
            new_categories = np.arange(int(types_dict[i]['dim']))
            cat_data = new_categories[indexes]
            #Create one hot encoding for the categories
            aux = np.zeros([np.shape(data)[0],len(new_categories)])
            aux[np.arange(np.shape(data)[0]),cat_data] = 1
            data_complete.append(aux)
            
        elif types_dict[i]['type'] == 'ordinal':
            #Get categories
            cat_data = [int(x) for x in data[:,i]]
            categories, indexes = np.unique(cat_data,return_inverse=True)
            #Transform categories to a vector of 0:n_categories
            new_categories = np.arange(int(types_dict[i]['dim']))
            cat_data = new_categories[indexes]
            #Create thermometer encoding for the categories
            aux = np.zeros([np.shape(data)[0],1+len(new_categories)])
            aux[:,0] = 1
            aux[np.arange(np.shape(data)[0]),1+cat_data] = -1
            aux = np.cumsum(aux,1)
            data_complete.append(aux[:,:-1])
            
        else:
            data_complete.append(np.transpose([data[:,i]]))
                    
    data = np.concatenate(data_complete,1)
    
        
    #Read Missing mask from csv (contains positions of missing values)
    n_samples = np.shape(data)[0]
    n_variables = len(types_dict)
    miss_mask = np.ones([np.shape(data)[0],n_variables])
    #If there is no mask, assume all data is observed
    if os.path.isfile(miss_file):
        with open(miss_file, 'r') as f:
            missing_positions = [[int(x) for x in rec] for rec in csv.reader(f, delimiter=',')]
            missing_positions = np.array(missing_positions)
        miss_mask[missing_positions[:,0]-1,missing_positions[:,1]-1] = 0 #Indexes in the csv start at 1
    
    return data, types_dict, miss_mask, true_miss_mask, n_samples
def read_and_create(container, data, mappings, object_type, create_new=False,
                     primary_key='id', counts = {}):
    new_count = 0
    existing_count = 0
    ignore_count = 0
    report = []


    # TODO(ivanteoh): Make sure the object has all the valid keys
    # keys = resources[0].keys()
    # hasProperty, getProperty not working

    catalog = api.portal.get_tool(name="portal_catalog")
    container_path = "/".join(container.getPhysicalPath())

    # TODO(ivanteoh): Make sure container is either folder or SiteRoot

    reader = csv.DictReader(data.splitlines(),
                            delimiter=",",
                            dialect="excel",
                            quotechar='"')

    # use IURLNormalizer instead of IIDNormalizer for url id
    normalizer = getUtility(IURLNormalizer)

    # return only fields are needed.
    for row in reader:

        ## set primary_key
        #if primary_key not in row:
        #    continue

        #key_value = row[primary_key].decode("utf-8")
        ## http://docs.plone.org/develop/plone/misc/normalizing_ids.html
        ## Normalizers to safe ids
        #fields[KEY_ID] = normalizer.normalize(key_value)

        key_arg = {}
        for key, value in row.items():
            if not key:
                continue
            if key in mappings:
                key_arg[mappings[key].decode("utf-8")] = \
                    value.decode("utf-8")


        # find existing obj
        obj = None
        if primary_key and primary_key not in key_arg:
            obj = None
            # in this case we shouldn't create or update it
            ignore_count += 1
            continue
        if primary_key in ['_path','id','_url']:
            if primary_key == '_url':
                path = '/'.join(getRequest().physicalPathFromURL(key_arg[primary_key]))
                if not path.startswith(container_path):
                    ignore_count += 1
                    continue
                path = path[len(container_path):].lstrip('/')
            else:
                path = key_arg[primary_key].encode().lstrip('/')
            obj = container.restrictedTraverse(path, None)
            if obj is None:
                # special case because id gets normalised.
                # try and guess the normalised id
                if primary_key == 'id':
                    # just in case id has '/' in
                    path = normalizer.normalize(key_arg[primary_key].encode())
                else:
                    path = path.rsplit('/',1)
                    path[-1] = normalizer.normalize(path[-1])
                    path = '/'.join(path)
                obj = container.restrictedTraverse(path, None)
            if 'id' not in key_arg:
                # ensure we don't use title
                key_arg['id'] = path.split('/')[-1]
            if obj is not None:
                existing_count += 1

        elif primary_key and primary_key in key_arg:
            # TODO: this is wrong since indexes aren't always the same as fields.
            # Should check if there is an index, else fall back to a find utility.
            query = dict(path={"query": container_path, "depth": 1},
    #                    portal_type=object_type,
                         )
            query[primary_key]=key_arg[primary_key]
            results = catalog(**query)
            if len(results) > 1:
                # the primary key should be unique; skip ambiguous matches
                ignore_count += 1
                continue
            elif len(results) == 1:
                obj = results[0].getObject()
                existing_count += 1

        if obj is None and create_new:
            #TODO: handle creating using passed in path. ie find/create folders
            # Save the objects in this container

            #TODO: validate we either have a id or title (or make random ids)

            #TODO: currently lets you create files without a require file field
            #which breaks on view

            obj = api.content.create(
                type=object_type,
                container=container,
                safe_id=True,
               **{key: key_arg[key] for key in ['id','title'] if key in key_arg}
            )
            new_count += 1
        elif obj is None:
            ignore_count += 1
            continue

        #if not checkPermission("zope.Modify", obj):
        #    ignore_count += 1
        #    continue


        key_arg['_path'] = '/'.join(obj.getPhysicalPath())[len(container_path)+1:]

        if 'id' in key_arg:
            del key_arg['id'] # otherwise transmogrifier renames it
        yield key_arg
        # TODO(ivanteoh): any performance risk by calling this?
        # TODO: only do this if we changed something
        notify(ObjectModifiedEvent(obj))

        #TODO: need to implement stop feature

        assert obj.id


        # generate report for csv export
#        key_arg[u"id"] = obj.id
#        key_arg[u'path'] = obj.absolute_url()
#        report.append(obj)

    # Later if want to rename
    # api.content.rename(obj=portal["blog"], new_id="old-blog")
    counts.update( {"existing_count": existing_count,
            "new_count": new_count,
            "ignore_count": ignore_count,
            "report": report} )
Example No. 19
# Homework #3: functions
# Deadline: November 4, 18:14
# Send the result to [email protected]

# also read the "Functions" chapter of the book "A Byte of Python" (p. 59)

# Task: analyze the age composition of a group of students, using functions.
# Keep in mind that a) some students have no age data, and b) the age may be given as a range, e.g. 25-35. So do not forget to handle errors and exceptions!

import csv

# remember that this time we read a list of dictionaries, not a list of lists!
# the dictionary keys for each student come from the first line of student_ages.csv: "Номер в списке" (list number), "Возраст" (age)
ages_list = list()
with open('/Users/andreymakarov/Downloads/mai_python_2019/03 Functions/ages.csv', encoding="utf-8") as csvfile:
    ages_dictreader = csv.DictReader(csvfile, delimiter=',')
    ages_list = list(ages_dictreader)

#print(ages_list)

# hint: this is how we can get data out of a list of dictionaries
# we did exactly the same thing in the lecture code with the apartments
for al in ages_list:
    print(f'"Номер в списке": {al["Номер в списке"]}, "Возраст": {al["Возраст"]}')
print()

# Task 1: write a function that splits the student sample into two parts:
# those less than or equal to the given age, and those older than the given age
# it should return pairs of "Номер в списке, Возраст" (a minimal sketch follows below)
print("FIRST FUNCTION")
print()
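
# A minimal sketch of one possible solution to Task 1 (not part of the original homework
# code): split the sample by a given age. Ages may be missing or given as a range such as
# "25-35", so parsing errors are caught and unusable rows are skipped.
def split_by_age(students, age_limit):
    younger_or_equal, older = [], []
    for student in students:
        raw_age = student["Возраст"]
        try:
            # for a range such as "25-35", take the lower bound
            age = int(str(raw_age).split("-")[0])
        except (ValueError, TypeError):
            continue  # missing or malformed age
        pair = (student["Номер в списке"], raw_age)
        if age <= age_limit:
            younger_or_equal.append(pair)
        else:
            older.append(pair)
    return younger_or_equal, older

# example usage: younger, older = split_by_age(ages_list, 25)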
Example No. 20
from functions import getTBAdata
import csv
from scipy.stats.mstats import gmean
from numpy import std

dps = {}
with open("DistrictRankings/YearlyPredictor/data_award.csv") as file:
    reader = csv.DictReader(file)
    for row in reader:
        dps[row["Team"]] = float(row["Avg"])

scores = []
events = getTBAdata("events/2019/keys")
for event in events:
    print(event)
    teams = getTBAdata("event/" + event + "/teams/keys")
    if len(teams) < 2: continue
    localdps = [dps[t] if t in dps and dps[t]!=0 else 1 for t in teams]
    scores.append((event, gmean(localdps), std(localdps)))

scores = sorted(scores, key=lambda x:x[1]/x[2], reverse=True)

with open("DistrictRankings/YearlyPredictor/ranked_events.csv", "w+") as file:
    file.write("Event,Mean,StDev,Score\n")
    for e in scores:
        file.write(e[0] + "," + str(e[1]) + "," + str(e[2]) + "," + str(e[1]/e[2]) + "\n")
Example No. 21
def get_seq_with_max_average_blast_score_MDebug(taxon_fasta_filename,
                                                taxon_blast_filename):
    seqids_to_seq = get_relevant_seqids(taxon_fasta_filename)

    logger.debug(
        "Generating dictionary of bitscores between seqs according to %s" %
        taxon_blast_filename)
    with open(taxon_blast_filename) as f:
        dr = csv.DictReader(f,
                            delimiter='\t',
                            fieldnames=[
                                'query_id', 'subject_id', 'pct_identity',
                                'align_len', 'mismatches', 'gap_openings',
                                'q_start', 'q_end', 's_start', 's_end', 'eval',
                                'bitscore'
                            ])
        max_score_dict = dict()
        all_seq_ids = []
        for row in dr:
            row_key = get_row_key(row)
            query_id = row['query_id']
            sub_id = row['subject_id']
            if query_id not in all_seq_ids:
                all_seq_ids.append(query_id)
            if sub_id not in all_seq_ids:
                all_seq_ids.append(sub_id)
            #logger.debug("Adding the following key %s" % row_key)
            if row_key not in max_score_dict:
                max_score_dict[row_key] = -1.0
            score = float(row['bitscore'])
            if max_score_dict[row_key] < score:
                max_score_dict[row_key] = score

    seqid_to_average_score = dict()
    missing_keys = list()
    #logger.debug("MDebug : max_score_dict\n")
    #logger.debug(max_score_dict)
    for seqid in seqids_to_seq:
        average_bit_score = 0.0
        for other_seqid in all_seq_ids:
            if seqid != other_seqid:
                key = get_taxon_gis_key(seqid, other_seqid)
                if key not in max_score_dict:
                    missing_keys.append(key)
                else:
                    average_bit_score = average_bit_score + max_score_dict[key]
        average_bit_score /= len(seqids_to_seq) - 1
        seqid_to_average_score[seqid] = average_bit_score
    if len(missing_keys) > 0:
        logger.error("Didn't find the following keys in blast file %s: %s" %
                     (taxon_blast_filename, ",".join(missing_keys)))

    max_seqid = None
    max_average_bitscore = -1
    for seqid in seqid_to_average_score:
        # second check is done in order to make sure this method will always return the same seqid in case there are several seqs with the same average_bitscore
        if (max_average_bitscore < seqid_to_average_score[seqid]) or (
                max_average_bitscore == seqid_to_average_score[seqid]
                and seqid > max_seqid):
            max_average_bitscore = seqid_to_average_score[seqid]
            max_seqid = seqid

    logger.info(
        "Max average bitscore is %f for %s. Found the following average bit scores per GI: %s"
        % (max_average_bitscore, max_seqid, seqid_to_average_score))
    return seqids_to_seq[max_seqid]
Example No. 22
try:
    input_file = sys.argv[1]
    start_date = sys.argv[2]
    csv_column = sys.argv[3]
    output_path = os.path.dirname(input_file)

    current_date = datetime.date.fromisoformat(start_date)
    cases_relation = {}

    counter = 0
    cases_per_day = 0
    check_constraint = 0
    deaths_per_day = 0
    with open(input_file, newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            counter += 1
            print("Checking case #{0}".format(counter), end='\r')

            if (row['FECHA_INGRESO'] == "{0:%d}/{0:%m}/{0:%y}".format(
                    current_date)):
                if (row['RESULTADO'] == '1'):
                    cases_per_day += 1
                # A number one means that the constraint asserts to true
                if (row[csv_column] == '1'):
                    check_constraint += 1
                if (row['FECHA_DEF'] != '9999-99-99'):
                    deaths_per_day += 1
            else:
                print("\nChecked {0}".format(row['FECHA_INGRESO']))
Example No. 23
import csv

db = []

with open("libri.csv", newline="") as dbfile:
    dbreader = csv.DictReader(dbfile, delimiter=";")
    for row in dbreader:
        db.append(dict(row))

# Exercise 1
def cle_date(d):
    return d["date"]

def ex1():
    return sorted(db, key=cle_date)

# Exercise 2
def ex2():
    genres = []
    for entry in db:
        if (entry["genre"] not in genres):
            genres.append(entry["genre"])
    return genres

# Exercise 3
def ex3():
    titres = []
    for entry in db:
        if (int(entry["date"]) < 1820):
            titres.append(entry["titre"])
    return titres
Example No. 24
con = sqlite3.connect("safety_harbor.db")
cur = con.cursor()

#drop tables if they exists so we do not insert repeat data
for tablename in table_list:
    stmt = "DROP TABLE IF EXISTS " + tablename
    cur.execute(stmt)
    con.commit()

# create nodes table
cur.execute("CREATE TABLE IF NOT EXISTS nodes (id, lat, lon, user, uid, version, changeset, timestamp);")

# load table
with codecs.open('nodes.csv', encoding='utf-8-sig') as fin:
    dr = csv.DictReader(fin)
    pprint.pprint(dr.fieldnames)
    to_db = [(i['id'], i['lat'], i['lon'], i['user'], i['uid'], i['version'], i['changeset'], i['timestamp']) for i in dr]

cur.executemany("INSERT INTO nodes (id, lat, lon, user, uid, version, changeset, timestamp) \
                VALUES (?, ?, ?, ?, ?, ?, ?, ?);", to_db)


# create nodes_tags table
cur.execute("CREATE TABLE IF NOT EXISTS nodes_tags (id, key, value, type);")

# load table
with codecs.open('nodes_tags.csv', encoding='utf-8-sig') as fin:
    dr = csv.DictReader(fin)
    pprint.pprint(dr.fieldnames)
    to_db = [(i['id'], i['key'], i['value'], i['type']) for i in dr]
 
cmdout = os.system('zcat < '+work_dir+'threatmetrix_payer_'+str(day)+'.csv.gz | head -1 > '+work_dir+'threatmetrix_payer_'+str(day)+'_sorted.csv')
cmdout = os.system('zcat < '+work_dir+'threatmetrix_payer_'+str(day)+'.csv.gz | sed 1d | LC_ALL=C sort -t, -k1,1 >> '+work_dir+'threatmetrix_payer_'+str(day)+'_sorted.csv')
cmdout = os.system('gzip '+work_dir+'threatmetrix_payer_'+str(day)+'_sorted.csv')


header_out = ['payment_request_id'] + signal_names
output_file = work_dir+"threatmetrix_payer_flat_"+str(day)+".csv.gz"
outfile = gzip.open(output_file, 'wt', newline='')
outcsv = csv.DictWriter(outfile, fieldnames=header_out)
outcsv.writeheader()


input_file = work_dir+"threatmetrix_payer_"+str(day)+"_sorted.csv.gz"
infile = gzip.open(input_file, 'rt')
incsv = csv.DictReader(infile)

row_flat = {}
payment_request_id = ''
nRow = 0
nPayment = 0
for row in incsv:
    if not is_number(row['payment_request_id']) or row['payment_request_id'] == '':
        print("key not valid:", row['payment_request_id'])
        continue
    # tell if we have reached a new payment
    if row['payment_request_id'] != payment_request_id:
        # output the last accumulated row, unless this is the first payment_request_id
        if nRow != 0:
            # print(row_flat)
            outcsv.writerow(row_flat)
Example No. 26
from flask import Flask

app = Flask(__name__)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str)
    parser.add_argument("--port", type=int)
    parser.add_argument("--file", type=str)
    args = parser.parse_args()

    response = {}
    with open(args.file, encoding="utf8") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';', quotechar='"')

        for creature in reader:
            specie = creature["creature_type"]
            if specie not in response:
                response[specie] = {
                    "habitats": set(),
                    "colors": set(),
                    "heights": set(),
                    "feeds": set()
                }

            response[specie]["habitats"].add(
                creature["habitat"]
            )
            response[specie]["colors"].add(
Example No. 27
def read_from_csv_dict():
    with open('data.csv', 'rt') as f:
        csvin = csv.DictReader(f, fieldnames=['first', 'last'])
        for row in csvin:
            print(row)
Example No. 28
def __initialize_clients_from_storage():
    with open(CLIENT_TABLE, mode='r') as f:
        reader = csv.DictReader(f, fieldnames=CLIENTS_SCHEMA)

        for row in reader:
            clients.append(row)
Example No. 29
def readcsv(fname):
    """Reads the CSV file given and returns a list of dicts"""
    import csv
    reader = csv.DictReader(open(fname))
    ret = [row for row in reader]
    return ret
Example No. 30
  def vqa_v2_generator(self, data_dir, tmp_dir, datasets):
    """VQA v2 generator using image features."""
    _get_vqa_v2_annotations(tmp_dir, self._VQA_V2_ANNOTATION_URL)
    _get_vqa_v2_image_feature_dataset(tmp_dir, self._VQA_V2_FEATURE_URL)
    vocab_path = os.path.join(data_dir, self.vocab_filename)
    if not tf.gfile.Exists(vocab_path):
      vocab_tmp_path = os.path.join(tmp_dir, self.vocab_filename)
      tf.gfile.Copy(vocab_tmp_path, vocab_path)
      with tf.gfile.GFile(vocab_path, mode="r") as f:
        vocab_data = "<pad>\n<EOS>\n" + f.read() + "UNK\n"
      with tf.gfile.GFile(vocab_path, mode="w") as f:
        f.write(vocab_data)
    label_path = os.path.join(data_dir, self.label_filename)
    if not tf.gfile.Exists(label_path):
      label_tmp_path = os.path.join(tmp_dir, self.label_filename)
      tf.gfile.Copy(label_tmp_path, label_path)

    vocab_encoder = text_encoder.TokenTextEncoder(vocab_path, replace_oov="UNK")
    label_encoder = text_encoder.ClassLabelEncoder(
        class_labels_fname=label_path)

    # merge annotations
    annotation_json = []
    for _, annotation_file in datasets:
      annotation_path = os.path.join(tmp_dir, annotation_file)
      with tf.gfile.Open(annotation_path) as f:
        annotation_json += json.loads(f.read())
    annotation_count = len(annotation_json)
    tf.logging.info("Processing %d annotations for vqa v2" %(annotation_count))

    imageid2annotation = {}
    for anno in annotation_json:
      if anno["image_id"] not in imageid2annotation:
        imageid2annotation[anno["image_id"]] = [anno]
      else:
        imageid2annotation[anno["image_id"]].append(anno)

    csv.field_size_limit(sys.maxsize)
    for feature_file, _ in datasets:
      feature_file_path = os.path.join(tmp_dir, feature_file)
      with open(feature_file_path, "r+b") as tsv_file:
        csv_reader = csv.DictReader(
            tsv_file, delimiter="\t", fieldnames=self.feature_file_field_names)
        for item in csv_reader:
          item["num_boxes"] = int(item["num_boxes"])
          image_id = int(item["image_id"])
          image_w = float(item["image_w"])
          image_h = float(item["image_h"])
          bboxes = np.frombuffer(base64.decodestring(item["boxes"]),
                                 dtype=np.float32).reshape(
                                     (item["num_boxes"], -1))

          box_width = bboxes[:, 2] - bboxes[:, 0]
          box_height = bboxes[:, 3] - bboxes[:, 1]
          scaled_width = box_width / image_w
          scaled_height = box_height / image_h
          scaled_x = bboxes[:, 0] / image_w
          scaled_y = bboxes[:, 1] / image_h

          box_width = box_width[..., np.newaxis]
          box_height = box_height[..., np.newaxis]
          scaled_width = scaled_width[..., np.newaxis]
          scaled_height = scaled_height[..., np.newaxis]
          scaled_x = scaled_x[..., np.newaxis]
          scaled_y = scaled_y[..., np.newaxis]

          spatial_features = np.concatenate(
              (scaled_x,
               scaled_y,
               scaled_x + scaled_width,
               scaled_y + scaled_height,
               scaled_width,
               scaled_height),
              axis=1)

          if image_id in imageid2annotation:
            for anno in imageid2annotation[image_id]:
              question = vocab_encoder.encode(anno["question"])
              answer = [label_encoder.encode(ans) for ans in anno["answer"]]
              answer = answer if answer else [0]  # 0 indicates padding
              yield {
                  "image/feature":
                  np.frombuffer(base64.decodestring(item["features"]),
                                dtype=np.float32).tolist(),
                  "image/spatial_feature": spatial_features.flatten().tolist(),
                  "image/height": [image_h],
                  "image/width": [image_w],
                  "image/bboxes": bboxes.flatten().tolist(),
                  "image/image_id": [image_id],
                  "image/question_id": [anno["question_id"]],
                  "image/question": question,
                  "image/answer": answer,
              }

            del imageid2annotation[image_id]

    # assert all annotations are included
    assert not imageid2annotation