def parse_source_file(self, source_file, article_id_map):
    """Import source articles and their sentences from a segments CSV.

    The file layout is uniform:
        lang,(seg_id1,tag1,seg1,img_url1,machine_translation1),...,(seg_idn,...)
    where each seg_id cell holds "<article_id>_<segment_id>".  Segment id 0
    marks the start of a new article.

    :param source_file: path of the CSV file to import
    :param article_id_map: maps article ids to titles (passed through to
        ``self.save_sentence``)
    """
    f = open(source_file, 'r')
    try:
        csv_reader = csv.reader(f)
        headers = next(csv_reader)
        header_map = dict((h, i) for i, h in enumerate(headers))
        sa = SourceArticle()
        # The seg_id columns are fixed per file, so resolve the offsets once
        # instead of once per row.
        segments = ['seg_id%s' % i for i in range(1, 11)]
        segment_offsets = [header_map[seg] for seg in segments]
        for line in csv_reader:
            for offs in segment_offsets:
                try:
                    (aid, seg_id) = line[offs].split('_')
                except IndexError:
                    # Short row: treat this basically like an eof.
                    try:
                        sa.save(manually_splitting=True)
                    except UnicodeDecodeError:
                        print('Argh! Unicode issues (1)...')
                        sa.delete()
                    break
                if int(seg_id) == 0:
                    # Segment 0 starts a new article: flush the previous one
                    # and begin a fresh SourceArticle.
                    sa.sentences_processed = True
                    try:
                        self.save_sentence(sa, line[0], aid, article_id_map[aid])
                    except UnicodeDecodeError:
                        print('Argh! Unicode issues...(2)')
                        sa.delete()
                    # make a new sa object and save it to get an id
                    sa = SourceArticle()
                    sa.save(manually_splitting=True)
                tag = line[offs + 1]
                seg = line[offs + 2]
                ss = SourceSentence()
                ss.article = sa
                ss.text = seg
                ss.segment_id = seg_id
                # BUG FIX: the "LastSentence" end-of-paragraph marker lives in
                # the tag column, not in the segment text (matches the other
                # parsers in this file).
                ss.end_of_paragraph = re.search("LastSentence", tag) or False
                ss.save()
                print('%s :: %s :: %s' % (aid, seg_id, tag))
    finally:
        # The original leaked the file handle; always close it.
        f.close()
def handle_noargs(self, **options):
    """Import every ArticleOfInterest as a SourceArticle plus a
    TranslationRequest, skipping revisions that were already imported.

    On any failure, best-effort cleanup deletes whatever was partially
    created for that article before moving on to the next one.
    """
    articles_of_interest = ArticleOfInterest.objects.all()
    for article in articles_of_interest:
        article_dict = query_text_rendered(article.title,
                                           language=article.title_language)
        # don't import articles we already have
        if SourceArticle.objects.filter(
                doc_id__exact='%s' % article_dict['revid'],
                language=article.title_language):
            continue
        # Track what has actually been created so cleanup cannot raise
        # NameError (the original referenced `tr` even when the exception
        # fired before it was assigned).
        source_article = None
        tr = None
        try:
            source_article = SourceArticle(
                title=article.title,
                language=article.title_language,
                source_text=article_dict['html'],
                timestamp=datetime.now(),
                doc_id=article_dict['revid'])
            source_article.save()
            tr = TranslationRequest(
                article=source_article,
                target_language=article.target_language,
                date=datetime.now(),
                translator=DEFAULT_TRANNY)
            tr.save()
        except Exception as e:
            print(type(e))
            print(e.args)
            # Best-effort cleanup: delete only what exists, and do not let a
            # failure deleting one object skip the other (the original's
            # single bare try/except did both).
            for obj in (tr, source_article):
                if obj is not None:
                    try:
                        obj.delete()
                    except Exception:
                        pass
def parse_source_file(self, source_file, article_id_map):
    """Import or update SourceArticles and their SourceSentences from a CSV.

    Row layout (headers are uniform):
        lang,(seg_id1,tag1,seg1,img_url1,machine_translation1),...,(seg_idn,...)
    where each seg_id cell is "<article_id>_<segment_id>".  An article
    already imported for this language/doc_id is updated in place;
    otherwise a new SourceArticle is created.

    :param source_file: path of the CSV file to import
    :param article_id_map: maps article ids to article titles
    """
    f = open(source_file, 'r')
    try:
        csv_reader = unicode_csv_reader(f)
        headers = next(csv_reader)
        header_map = dict((h, i) for i, h in enumerate(headers))
        sa = None
        cur_aid = -1
        # Column offsets are fixed for the whole file; resolve them once
        # instead of once per row as the original did.
        segments = ['seg_id%s' % i for i in range(1, 11)]
        segment_offsets = [header_map[seg] for seg in segments]
        for line in csv_reader:
            for offs in segment_offsets:
                try:
                    (aid, seg_id) = line[offs].split('_')
                except IndexError:
                    # treating this basically like an eof for this row
                    break
                if cur_aid != int(aid):
                    if sa:
                        # save the previous SourceArticle
                        sa.save(manually_splitting=True)
                    # check if the document is already imported; otherwise
                    # start a fresh one
                    try:
                        sa = SourceArticle.objects.filter(
                            language=line[0]).get(doc_id=aid)
                    except SourceArticle.DoesNotExist:
                        sa = SourceArticle()
                    # The original duplicated all of these assignments
                    # verbatim in both branches; set them once here.
                    sa.sentences_processed = True
                    cur_aid = int(aid)
                    sa.language = line[0]
                    sa.doc_id = aid
                    sa.timestamp = datetime.now()
                    sa.title = article_id_map[aid]
                    # save to get an id for the SourceArticle instance
                    sa.save(manually_splitting=True)
                tag = line[offs + 1]
                seg = line[offs + 2]
                # Update the existing sentence if present, else create one;
                # both paths set the same fields.
                try:
                    ss = sa.sourcesentence_set.get(segment_id=seg_id)
                except SourceSentence.DoesNotExist:
                    ss = SourceSentence()
                    ss.article = sa
                ss.text = seg
                ss.segment_id = seg_id
                ss.end_of_paragraph = re.search("LastSentence", tag) or False
                ss.save()
                sa.source_text += seg + u'\n'
        if sa:
            sa.save(manually_splitting=True)
    finally:
        # The original leaked the file handle; always close it.
        f.close()
def parse_result_file(self, result_file, source_lang, target_lang):
    """Import MTurk translation results from a CSV.

    Each row carries up to 10 (Input.seg_id, Input.seg, Answer.translation)
    triples plus assignment metadata.  Creates/updates the SourceArticle
    and its SourceSentences, and a TranslatedArticle with its
    TranslatedSentences.

    :param result_file: path of the MTurk results CSV
    :param source_lang: language of the source sentences
    :param target_lang: language of the translations
    """
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    def parse_submit_time(date_string):
        # SubmitTime fields look like "Wed Jan 05 12:34:56 TZ 2011":
        # [weekday, month, day, hh:mm:ss, tz, year].  The original
        # duplicated this parsing verbatim in two branches.
        df = date_string.split(' ')
        tf = df[3].split(':')
        return datetime(int(df[5]), month_names.index(df[1]) + 1,
                        int(df[2]), int(tf[0]), int(tf[1]), int(tf[2]))

    f = open(result_file, 'r')
    try:
        csv_reader = unicode_csv_reader(f)
        headers = next(csv_reader)
        header_map = {}
        for i, h in enumerate(headers):
            header_map[h] = i
        # not assuming a specific order for the fields
        segment_ids = [header_map['Input.seg_id%d' % i] for i in range(1, 11)]
        segments = [header_map['Input.seg%d' % i] for i in range(1, 11)]
        translations = [header_map['Answer.translation%d' % i]
                        for i in range(1, 11)]
        sa = None
        ta = None
        cur_aid = -1
        has_title = 'Input.article' in header_map
        for line in csv_reader:
            if has_title:
                title = line[header_map['Input.article']] + ' (translated)'
            else:
                title = 'Noname (translated)'
            approved = (line[header_map['AssignmentStatus']] == 'Approved')
            for i in range(10):
                try:
                    (aid, seg_id) = line[segment_ids[i]].split('_')
                except ValueError:
                    # empty/malformed cell: treat like an eof for this row
                    break
                if cur_aid != int(aid):
                    if sa:
                        # save the previous SourceArticle
                        sa.save(manually_splitting=True)
                    if not has_title:
                        title = aid + ' ' + title
                    # Reuse an already-imported source article when possible;
                    # the original duplicated every assignment below in both
                    # branches of this lookup.
                    try:
                        sa = SourceArticle.objects.filter(
                            language=source_lang).get(doc_id=aid)
                    except SourceArticle.DoesNotExist:
                        sa = SourceArticle()
                    sa.sentences_processed = True
                    cur_aid = int(aid)
                    sa.language = source_lang
                    sa.doc_id = aid
                    sa.timestamp = datetime.now()
                    sa.title = title
                    # save to get an id for the SourceArticle instance
                    sa.save(manually_splitting=True)
                    if ta:
                        # save the previous target article
                        ta.save()
                    # check if the target article has been translated and
                    # imported; either way the fields below are (re)set
                    try:
                        ta = TranslatedArticle.objects.filter(article=sa).get(
                            language=target_lang)
                    except TranslatedArticle.DoesNotExist:
                        ta = TranslatedArticle()
                        ta.article = sa
                    ta.title = title
                    ta.timestamp = datetime.now()
                    ta.language = target_lang
                    ta.approved = approved
                    ta.save()
                end_of_paragraph = True
                # NOTE(review): tag headers are addressed 0-based here
                # ('Input.tag%d' % i with i in 0..9) while the seg headers
                # are 1-based — confirm against the HIT column layout.
                tag_id = 'Input.tag%d' % i
                if tag_id in header_map:
                    tag = line[header_map[tag_id]]
                    end_of_paragraph = re.search("LastSentence", tag) or False
                seg = line[segments[i]]
                # Update the existing source sentence if present, else make
                # a new one; both paths set the same fields.
                try:
                    ss = sa.sourcesentence_set.get(segment_id=seg_id)
                except SourceSentence.DoesNotExist:
                    ss = SourceSentence()
                    ss.article = sa
                ss.text = seg
                ss.segment_id = seg_id
                ss.end_of_paragraph = end_of_paragraph
                ss.save()
                sa.source_text += seg + u'\n'
                translation = line[translations[i]]
                # Same update-or-create pattern for the translated sentence;
                # only a newly created one is added to ta.sentences.
                try:
                    ts = ta.sentences.get(segment_id=seg_id)
                    is_new_ts = False
                except TranslatedSentence.DoesNotExist:
                    ts = TranslatedSentence()
                    ts.segment_id = seg_id
                    is_new_ts = True
                ts.source_sentence = ss
                ts.text = translation
                ts.translated_by = line[header_map['WorkerId']]
                ts.language = target_lang
                ts.translation_date = parse_submit_time(
                    line[header_map['SubmitTime']])
                ts.approved = approved
                ts.end_of_paragraph = ss.end_of_paragraph
                ts.save()
                if is_new_ts:
                    ta.sentences.add(ts)
        if sa:
            sa.save(manually_splitting=True)
        if ta:
            ta.save()
    finally:
        # The original leaked the file handle; always close it.
        f.close()