def parse_source_file(self, source_file, article_id_map):
    f = open(source_file, 'r')
    csv_reader = csv.reader(f)
    headers = csv_reader.next()
    header_map = {}
    for i, h in enumerate(headers):
        header_map[h] = i
    # The headers are uniform in this file:
    # lang,(seg_id1,tag1,seg1,img_url1,machine_translation1),...,(seg_idn,...)
    sa = SourceArticle()
    cur_aid = -1
    language = None
    segments = ['seg_id%s' % i for i in xrange(1, 11)]
    for line in csv_reader:
        segment_offsets = [header_map[seg] for seg in segments]
        for offs in segment_offsets:
            try:
                (aid, seg_id) = line[offs].split('_')
            except IndexError:
                # the row has no more segments; treat this basically like an eof
                try:
                    sa.save(manually_splitting=True)
                except UnicodeDecodeError:
                    print 'Argh! Unicode issues (1)...'
                    sa.delete()
                break
            if int(seg_id) == 0:
                # segment 0 marks the start of a new article: flush the current
                # one and start a fresh SourceArticle
                sa.sentences_processed = True
                language = line[0]
                try:
                    self.save_sentence(sa, line[0], aid, article_id_map[aid])
                except UnicodeDecodeError:
                    print 'Argh! Unicode issues...(2)'
                    sa.delete()
                # make a new sa object
                sa = SourceArticle()
                sa.save(manually_splitting=True)  # get an id
            tag = line[offs + 1]
            seg = line[offs + 2]
            ss = SourceSentence()
            ss.article = sa
            ss.text = seg
            ss.segment_id = seg_id
            # the "LastSentence" marker lives in the tag column, not in the segment text
            ss.end_of_paragraph = re.search("LastSentence", tag) or False
            ss.save()
            print '%s :: %s :: %s' % (aid, seg_id, tag)
def parse_source_file(self, source_file, article_id_map):
    f = open(source_file, 'r')
    csv_reader = unicode_csv_reader(f)
    headers = csv_reader.next()
    header_map = {}
    for i, h in enumerate(headers):
        header_map[h] = i
    # The headers are uniform in this file:
    # lang,(seg_id1,tag1,seg1,img_url1,machine_translation1),...,(seg_idn,...)
    sa = None
    cur_aid = -1
    language = None
    segments = ['seg_id%s' % i for i in xrange(1, 11)]
    for line in csv_reader:
        segment_offsets = [header_map[seg] for seg in segments]
        for offs in segment_offsets:
            try:
                (aid, seg_id) = line[offs].split('_')
            except IndexError:
                # the row has no more segments; treat this basically like an eof
                break
            if cur_aid != int(aid):
                if sa:
                    # save the previous SourceArticle
                    sa.save(manually_splitting=True)
                # check if the document is already imported;
                # otherwise make a new sa object
                try:
                    sa = SourceArticle.objects.filter(language=line[0]).get(doc_id=aid)
                except SourceArticle.DoesNotExist:
                    sa = SourceArticle()
                sa.sentences_processed = True
                cur_aid = int(aid)
                language = line[0]
                sa.language = language
                sa.doc_id = aid
                sa.timestamp = datetime.now()
                sa.title = article_id_map[aid]
                sa.save(manually_splitting=True)  # get an id for the SourceArticle instance
            tag = line[offs + 1]
            seg = line[offs + 2]
            try:
                # reuse an existing sentence with this segment_id if there is one
                ss = sa.sourcesentence_set.get(segment_id=seg_id)
            except SourceSentence.DoesNotExist:
                ss = SourceSentence()
                ss.article = sa
            ss.text = seg
            ss.segment_id = seg_id
            ss.end_of_paragraph = re.search("LastSentence", tag) or False
            ss.save()
            sa.source_text += seg + u'\n'
    if sa:
        sa.save(manually_splitting=True)
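# `unicode_csv_reader` is called above but not defined in this excerpt. Below is
# a minimal sketch, assuming it follows the standard Python 2 `csv` documentation
# recipe for UTF-8 input: wrap csv.reader and decode every cell to unicode. The
# implementation (and the assumption that the input files are UTF-8) is
# illustrative only, not the project's actual helper; it would live at module level.
def unicode_csv_reader(utf8_file, dialect=csv.excel, **kwargs):
    # csv.reader yields rows of byte strings under Python 2; decode each cell
    # so the Django model fields receive unicode.
    for row in csv.reader(utf8_file, dialect=dialect, **kwargs):
        yield [unicode(cell, 'utf-8') for cell in row]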
def parse_result_file(self, result_file, source_lang, target_lang):
    f = open(result_file, 'r')
    csv_reader = unicode_csv_reader(f)
    headers = csv_reader.next()
    header_map = {}
    for i, h in enumerate(headers):
        header_map[h] = i
    # not assuming a specific order for the fields
    sa = None
    cur_aid = -1
    segment_ids = [header_map[x] for x in ['Input.seg_id%d' % i for i in range(1, 11)]]
    segments = [header_map[x] for x in ['Input.seg%d' % i for i in range(1, 11)]]
    translations = [header_map[x] for x in ['Answer.translation%d' % i for i in range(1, 11)]]
    ta = None
    has_title = 'Input.article' in header_map
    for line in csv_reader:
        if has_title:
            title = line[header_map['Input.article']] + ' (translated)'
        else:
            title = 'Noname (translated)'
        approved = (line[header_map['AssignmentStatus']] == 'Approved')
        for i in range(10):
            try:
                (aid, seg_id) = line[segment_ids[i]].split('_')
            except ValueError:
                # the row has no more segments; treat this basically like an eof
                break
            if cur_aid != int(aid):
                if sa:
                    # save the previous SourceArticle
                    sa.save(manually_splitting=True)
                if not has_title:
                    title = aid + ' ' + title
                # check if the document is already imported;
                # otherwise make a new sa object
                try:
                    sa = SourceArticle.objects.filter(language=source_lang).get(doc_id=aid)
                except SourceArticle.DoesNotExist:
                    sa = SourceArticle()
                sa.sentences_processed = True
                cur_aid = int(aid)
                sa.language = source_lang
                sa.doc_id = aid
                sa.timestamp = datetime.now()
                sa.title = title
                sa.save(manually_splitting=True)  # get an id for the SourceArticle instance
                if ta:
                    # save the previous target article
                    ta.save()
                # check if the target article has already been translated and
                # imported; if it has, do not touch fields we do not know about
                try:
                    ta = TranslatedArticle.objects.filter(article=sa).get(language=target_lang)
                except TranslatedArticle.DoesNotExist:
                    # make a new TranslatedArticle object
                    ta = TranslatedArticle()
                    ta.article = sa
                ta.title = title
                ta.timestamp = datetime.now()
                ta.language = target_lang
                ta.approved = approved
                ta.save()
            # assuming the tag columns follow the same 1-based naming as the
            # segment columns (Input.tag1 .. Input.tag10), index them with i + 1
            end_of_paragraph = True
            tag_id = 'Input.tag%d' % (i + 1)
            if tag_id in header_map:
                tag = line[header_map[tag_id]]
                end_of_paragraph = re.search("LastSentence", tag) or False
            seg = line[segments[i]]
            try:
                # reuse an existing source sentence with this segment_id if there is one
                ss = sa.sourcesentence_set.get(segment_id=seg_id)
            except SourceSentence.DoesNotExist:
                ss = SourceSentence()
                ss.article = sa
            ss.text = seg
            ss.segment_id = seg_id
            ss.end_of_paragraph = end_of_paragraph
            ss.save()
            sa.source_text += seg + u'\n'
            translation = line[translations[i]]
            try:
                # likewise reuse an existing translated sentence if there is one
                ts = ta.sentences.get(segment_id=seg_id)
            except TranslatedSentence.DoesNotExist:
                ts = TranslatedSentence()
                ts.segment_id = seg_id
            ts.source_sentence = ss
            ts.text = translation
            ts.translated_by = line[header_map['WorkerId']]
            ts.language = target_lang
            # SubmitTime is of the form "<weekday> <month> <day> <H:M:S> <tz> <year>"
            date_string = line[header_map['SubmitTime']]
            df = date_string.split(' ')
            tf = df[3].split(':')
            ts.translation_date = datetime(
                int(df[5]),
                ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'].index(df[1]) + 1,
                int(df[2]), int(tf[0]), int(tf[1]), int(tf[2]))
            ts.approved = approved
            ts.end_of_paragraph = ss.end_of_paragraph
            ts.save()
            ta.sentences.add(ts)
    if sa:
        sa.save(manually_splitting=True)
    if ta:
        ta.save()
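# The SubmitTime parsing above reconstructs a datetime by hand from the
# space-separated fields of the timestamp string. A more compact equivalent,
# sketched here as a hypothetical helper (not part of the original code), drops
# the timezone token and lets time.strptime do the work. It assumes the same
# "<weekday> <month> <day> <H:M:S> <tz> <year>" layout the code above relies on.
def parse_submit_time(date_string):
    import time
    parts = date_string.split(' ')
    # keep "weekday month day time year" and skip the timezone abbreviation,
    # which time.strptime cannot parse portably
    cleaned = ' '.join(parts[:4] + parts[5:])
    return datetime(*time.strptime(cleaned, '%a %b %d %H:%M:%S %Y')[:6])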
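# Hypothetical usage of the two importers above (the class name, file names,
# and language codes are illustrative assumptions, not part of the original code):
#
#   importer = ArticleImporter()
#   id_map = {'1234': u'Sample article title'}
#   importer.parse_source_file('source_batch.csv', id_map)
#   importer.parse_result_file('mturk_results.csv', 'en', 'es')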