def handle_noargs(self, **options):
    """Import every ArticleOfInterest that has no SourceArticle yet.

    An article of interest is skipped when a SourceArticle with the same
    title and language already exists.  Otherwise its rendered text is
    fetched with ``query_text_rendered`` and stored as a new SourceArticle.
    Errors raised while building or saving the record are printed and the
    loop moves on to the next article.
    """
    for article in ArticleOfInterest.objects.all():
        # don't import articles we already have
        if SourceArticle.objects.filter(title__exact='%s' % article.title,
                                        language=article.title_language):
            continue
        article_dict = query_text_rendered(
            article.title,
            language=article.title_language.code)
        # Reset on every iteration: otherwise a failure before the
        # assignment below would leave this name bound to the record saved
        # for the previous article, and the cleanup would delete that one.
        source_article = None
        try:
            source_article = SourceArticle(
                title=article.title,
                language=article.title_language,
                source_text=article_dict['html'],
                timestamp=datetime.now(),
                doc_id=article_dict['revid'],
            )
            source_article.save()
        except Exception as e:
            print("Looks like we have an exception of type %s" % type(e))
            print("Exception args: %s" % (e.args,))
            # Best-effort rollback, only for a record created in this
            # iteration that actually reached the database.
            if source_article is not None and source_article.pk is not None:
                try:
                    source_article.delete()
                except Exception:
                    pass
Пример #2
0
 def handle_noargs(self, **options):
     """Import articles of interest and queue a translation request for each.

     For every ArticleOfInterest the rendered text is fetched with
     ``query_text_rendered``.  When no SourceArticle with the returned
     revision id and the article's language exists, a SourceArticle and a
     TranslationRequest (assigned to DEFAULT_TRANNY) are saved.  Errors
     raised while creating them are printed and the loop continues.
     """
     for article in ArticleOfInterest.objects.all():
         article_dict = query_text_rendered(article.title,
                                            language=article.title_language)
         # don't import articles we already have
         if SourceArticle.objects.filter(
                 doc_id__exact='%s' % article_dict['revid'],
                 language=article.title_language):
             continue
         # Reset on every iteration so the cleanup below can never reach
         # the records saved for a previous article.
         source_article = None
         tr = None
         try:
             source_article = SourceArticle(title=article.title,
                                            language=article.title_language,
                                            source_text=article_dict['html'],
                                            timestamp=datetime.now(),
                                            doc_id=article_dict['revid'])
             source_article.save()
             tr = TranslationRequest(article=source_article,
                                     target_language=article.target_language,
                                     date=datetime.now(),
                                     translator=DEFAULT_TRANNY)
             tr.save()
         except Exception as e:
             print(type(e))
             print(e.args)
             # Best-effort rollback of whatever this iteration managed to
             # save; unsaved objects (pk is None) are left alone.
             for record in (tr, source_article):
                 if record is None or record.pk is None:
                     continue
                 try:
                     record.delete()
                 except Exception:
                     pass
Пример #3
0
 def handle_noargs(self, **options):
     """Store new source articles plus a translation request for each one.

     Each ArticleOfInterest is fetched through ``query_text_rendered``; it
     is skipped when a SourceArticle with the same revision id and language
     is already stored.  Otherwise a SourceArticle and a TranslationRequest
     (translator DEFAULT_TRANNY) are created.  Any exception raised during
     creation is printed and the import continues with the next article.
     """
     articles_of_interest = ArticleOfInterest.objects.all()
     for article in articles_of_interest:
         article_dict = query_text_rendered(article.title,
                                            language=article.title_language)
         revision_id = article_dict['revid']
         # don't import articles we already have
         already_imported = SourceArticle.objects.filter(
             doc_id__exact='%s' % revision_id,
             language=article.title_language)
         if already_imported:
             continue

         # Both names are cleared per iteration: a failure early in the try
         # block must not let the cleanup delete the previous article's
         # records through stale references.
         source_article = None
         tr = None
         try:
             source_article = SourceArticle(
                 title=article.title,
                 language=article.title_language,
                 source_text=article_dict['html'],
                 timestamp=datetime.now(),
                 doc_id=revision_id)
             source_article.save()
             tr = TranslationRequest(
                 article=source_article,
                 target_language=article.target_language,
                 date=datetime.now(),
                 translator=DEFAULT_TRANNY)
             tr.save()
         except Exception as e:
             print(type(e))
             print(e.args)
             # Best-effort rollback limited to rows saved in this iteration.
             for record in (tr, source_article):
                 if record is None or record.pk is None:
                     continue
                 try:
                     record.delete()
                 except Exception:
                     pass
Пример #4
0
def request_translation(request, form_class=TranslationRequestForm,
                        template_name="wt_articles/request_form.html",
                        deletedId=-1, deleteAll=False, update=False):
    """
    Render the translation request form, optionally doing maintenance first.

    deletedId in this context is the deleted article id; -1 means that
    nothing is deleted.  deleteAll removes every ArticleOfInterest.

    When update is true and the request carries POST data, a UserForm bound
    to the current user is validated and saved.  When update is true and
    there is no POST data, every ArticleOfInterest without a SourceArticle
    of the same title and language is fetched with query_text_rendered and
    stored together with a TranslationRequest.

    On a POST request the submitted form is validated and saved unless an
    ArticleOfInterest with the same title, title language and target
    language already exists.  The view always returns template_name
    rendered with the form and the result of all_articles_of_interest().
    """
    # Update
    if update:
        from wikipydia import query_text_rendered, query_text_raw
        from wt_articles import DEFAULT_TRANNY
        if request.POST:
            post = request.POST.copy()
            user_form = UserForm(post, instance=request.user)
            if user_form.is_valid():
                user_form.save()
                # NOTE(review): this redirect is built but never returned,
                # so it does not affect the response -- confirm the intent.
                response = redirect('/accounts/' + request.user.username)
        else:
            for article in ArticleOfInterest.objects.all():
                if SourceArticle.objects.filter(
                        title=article.title,
                        language=article.title_language):
                    continue
                article_dict = query_text_rendered(
                    article.title, language=article.title_language)
                print(article.title, article.title_language)
                # Cleared per iteration so that the cleanup below cannot
                # delete records that belong to a previous article.
                source_article = None
                tr = None
                try:
                    source_article = SourceArticle(
                        title=article.title,
                        language=article.title_language,
                        source_text=article_dict['html'],
                        timestamp=datetime.now(),
                        doc_id=article_dict['revid'])
                    source_article.save()
                    tr = TranslationRequest(
                        article=source_article,
                        target_language=article.target_language,
                        date=datetime.now(),
                        translator=DEFAULT_TRANNY)
                    tr.save()
                except Exception as e:
                    print(type(e))
                    print(e.args)
                    # Best-effort rollback of rows saved in this iteration.
                    for record in (tr, source_article):
                        if record is None or record.pk is None:
                            continue
                        try:
                            record.delete()
                        except Exception:
                            pass

    # Delete
    if deletedId != -1:
        ArticleOfInterest.objects.filter(id=deletedId).delete()
    if deleteAll:
        ArticleOfInterest.objects.all().delete()

    if request.method == "POST":
        request_form = form_class(request.POST)
        if request_form.is_valid():
            title = request_form.cleaned_data['title']
            title_language = request_form.cleaned_data['title_language']
            target_language = request_form.cleaned_data['target_language']
            exists = ArticleOfInterest.objects.filter(
                title__exact=title,
                title_language__exact=title_language,
                target_language__exact=target_language)
            # Only store the request when no identical one is on record.
            if not exists:
                translation_request = request_form.save(commit=False)
                translation_request.date = datetime.now()
                translation_request.save()
    else:
        request_form = form_class()

    articles = all_articles_of_interest()
    return render_to_response(template_name, {
        "request_form": request_form,
        "articles": articles,
    }, context_instance=RequestContext(request))
Пример #5
0
    def parse_source_file(self, source_file, article_id_map):
        """Import source articles and their sentences from a CSV file.

        The first row holds the headers.  Every following row carries the
        language in column 0 and up to ten segments located through the
        ``seg_id1`` .. ``seg_id10`` headers; the tag and the sentence text
        sit in the two columns right after each segment id.  A segment id
        has the form ``<article id>_<segment id>``.

        Whenever the article id changes, the matching SourceArticle is
        looked up by language and doc_id (or created) and its metadata is
        refreshed.  Each segment updates the existing SourceSentence with
        that segment id or creates a new one; the text of new sentences is
        appended to the article's ``source_text``.

        article_id_map maps the article id string to the article title.
        """
        # The headers are uniform in this file
        # lang,(seg_id1,tag1,seg1,img_url1,machine_translation1),...,(seg_idn,...)
        segments = ['seg_id%s' % i for i in xrange(1, 11)]
        sa = None
        cur_aid = -1
        # The context manager closes the file even when a row fails to parse.
        with open(source_file, 'r') as f:
            csv_reader = unicode_csv_reader(f)
            headers = next(csv_reader)
            header_map = dict((h, i) for i, h in enumerate(headers))
            segment_offsets = [header_map[seg] for seg in segments]

            for line in csv_reader:
                for offs in segment_offsets:
                    try:
                        (aid, seg_id) = line[offs].split('_')
                    except (IndexError, ValueError):
                        # treating this basically like an eof: the row is
                        # too short, or the cell is empty / not "<aid>_<seg>"
                        break

                    if cur_aid != int(aid):
                        if sa:
                            # save the previous SourceArticle
                            sa.save(manually_splitting=True)
                        language = line[0]
                        # check if the document is already imported
                        try:
                            sa = SourceArticle.objects.filter(
                                language=language).get(doc_id=aid)
                        except SourceArticle.DoesNotExist:
                            # make a new sa object
                            sa = SourceArticle()
                        sa.sentences_processed = True
                        cur_aid = int(aid)
                        sa.language = language
                        sa.doc_id = aid
                        sa.timestamp = datetime.now()
                        sa.title = article_id_map[aid]
                        # get an id for the SourceArticle instance
                        sa.save(manually_splitting=True)

                    tag = line[offs + 1]
                    seg = line[offs + 2]

                    try:
                        ss = sa.sourcesentence_set.get(segment_id=seg_id)
                        is_new_sentence = False
                    except SourceSentence.DoesNotExist:
                        ss = SourceSentence()
                        ss.article = sa
                        is_new_sentence = True
                    ss.text = seg
                    ss.segment_id = seg_id
                    ss.end_of_paragraph = (
                        re.search("LastSentence", tag) is not None)
                    ss.save()
                    if is_new_sentence:
                        sa.source_text += seg + u'\n'

        if sa:
            sa.save(manually_splitting=True)
Пример #6
0
    def parse_result_file(self, result_file, source_lang, target_lang):
        """Import translated sentences from a results CSV file.

        The first row holds the headers; their order is not assumed.  The
        columns read are ``Input.seg_id1..10``, ``Input.seg1..10``,
        ``Answer.translation1..10``, ``AssignmentStatus``, ``WorkerId``,
        ``SubmitTime`` and, when present, ``Input.article`` and
        ``Input.tag1..10``.  A segment id has the form
        ``<article id>_<segment id>``.

        For every article id met, the SourceArticle (by source_lang and
        doc_id) and the TranslatedArticle (by article and target_lang) are
        fetched or created.  Every segment then updates or creates its
        SourceSentence and TranslatedSentence.

        ``SubmitTime`` is split on single spaces: field 1 is the English
        month abbreviation, field 2 the day, field 3 ``HH:MM:SS`` and
        field 5 the year.
        """
        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        sa = None
        ta = None
        cur_aid = -1
        # The context manager closes the file even when a row fails to parse.
        with open(result_file, 'r') as f:
            csv_reader = unicode_csv_reader(f)
            headers = next(csv_reader)
            header_map = dict((h, i) for i, h in enumerate(headers))

            # not assuming a specific order for the fields
            segment_ids = [header_map['Input.seg_id%d' % n]
                           for n in range(1, 11)]
            segments = [header_map['Input.seg%d' % n] for n in range(1, 11)]
            translations = [header_map['Answer.translation%d' % n]
                            for n in range(1, 11)]
            has_title = 'Input.article' in header_map

            for line in csv_reader:
                if has_title:
                    title = line[header_map['Input.article']] + ' (translated)'
                else:
                    title = 'Noname (translated)'
                approved = (line[header_map['AssignmentStatus']] == 'Approved')
                for i in range(10):
                    try:
                        (aid, seg_id) = line[segment_ids[i]].split('_')
                    except (IndexError, ValueError):
                        # treating this basically like an eof
                        break

                    if cur_aid != int(aid):
                        if sa:
                            # save the previous SourceArticle
                            sa.save(manually_splitting=True)
                        # Derive the title from the per-row base title so
                        # that several articles in one row do not pile up
                        # their ids in front of it.
                        if has_title:
                            article_title = title
                        else:
                            article_title = aid + ' ' + title
                        # check if the document is already imported
                        try:
                            sa = SourceArticle.objects.filter(
                                language=source_lang).get(doc_id=aid)
                        except SourceArticle.DoesNotExist:
                            # make a new sa object
                            sa = SourceArticle()
                        sa.sentences_processed = True
                        cur_aid = int(aid)
                        sa.language = source_lang
                        sa.doc_id = aid
                        sa.timestamp = datetime.now()
                        sa.title = article_title
                        # get an id for the SourceArticle instance
                        sa.save(manually_splitting=True)

                        if ta:
                            # save the previous target article
                            ta.save()
                        # check if the target article has been translated
                        # and imported
                        try:
                            ta = TranslatedArticle.objects.filter(
                                article=sa).get(language=target_lang)
                            # if there is one, do not touch unknown fields.
                        except TranslatedArticle.DoesNotExist:
                            # make a new TranslatedArticle object
                            ta = TranslatedArticle()
                            ta.article = sa
                        ta.title = article_title
                        ta.timestamp = datetime.now()
                        ta.language = target_lang
                        ta.approved = approved
                        ta.save()

                    # Column names are numbered from 1 while i counts from
                    # 0, so the tag that belongs to this segment is i + 1.
                    end_of_paragraph = True
                    tag_id = 'Input.tag%d' % (i + 1)
                    if tag_id in header_map:
                        tag = line[header_map[tag_id]]
                        end_of_paragraph = (
                            re.search("LastSentence", tag) is not None)

                    seg = line[segments[i]]
                    try:
                        ss = sa.sourcesentence_set.get(segment_id=seg_id)
                        is_new_sentence = False
                    except SourceSentence.DoesNotExist:
                        ss = SourceSentence()
                        ss.article = sa
                        is_new_sentence = True
                    ss.text = seg
                    ss.segment_id = seg_id
                    ss.end_of_paragraph = end_of_paragraph
                    ss.save()
                    if is_new_sentence:
                        sa.source_text += seg + u'\n'

                    translation = line[translations[i]]
                    try:
                        ts = ta.sentences.get(segment_id=seg_id)
                        is_new_translation = False
                    except TranslatedSentence.DoesNotExist:
                        ts = TranslatedSentence()
                        ts.segment_id = seg_id
                        is_new_translation = True
                    ts.source_sentence = ss
                    ts.text = translation
                    ts.translated_by = line[header_map['WorkerId']]
                    ts.language = target_lang
                    date_string = line[header_map['SubmitTime']]
                    df = date_string.split(' ')
                    tf = df[3].split(':')
                    ts.translation_date = datetime(
                        int(df[5]), months.index(df[1]) + 1, int(df[2]),
                        int(tf[0]), int(tf[1]), int(tf[2]))
                    ts.approved = approved
                    ts.end_of_paragraph = ss.end_of_paragraph
                    ts.save()
                    if is_new_translation:
                        ta.sentences.add(ts)

        if sa:
            sa.save(manually_splitting=True)
        if ta:
            ta.save()
Пример #7
0
    def parse_result_file(self, result_file, source_lang, target_lang):
        """Load translation results from a CSV file into the database.

        The header row is read first and columns are located by name, so
        their order does not matter.  Required columns are
        ``Input.seg_id1..10``, ``Input.seg1..10``,
        ``Answer.translation1..10``, ``AssignmentStatus``, ``WorkerId`` and
        ``SubmitTime``; ``Input.article`` and ``Input.tag1..10`` are used
        when present.  Segment ids look like ``<article id>_<segment id>``.

        Each time the article id changes, the SourceArticle (matched on
        source_lang and doc_id) and the TranslatedArticle (matched on the
        article and target_lang) are fetched or created and saved.  Each
        segment then updates or creates a SourceSentence and a
        TranslatedSentence.

        ``SubmitTime`` is split on single spaces; field 1 is the English
        month abbreviation, field 2 the day, field 3 ``HH:MM:SS`` and
        field 5 the year.
        """
        month_names = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                       "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
        column_numbers = range(1, 11)
        sa = None
        ta = None
        cur_aid = -1
        # Opening the file in a with-block guarantees that it is closed.
        with open(result_file, "r") as f:
            csv_reader = unicode_csv_reader(f)
            headers = next(csv_reader)
            header_map = dict((h, i) for i, h in enumerate(headers))

            # not assuming a specific order for the fields
            segment_ids = [header_map["Input.seg_id%d" % n] for n in column_numbers]
            segments = [header_map["Input.seg%d" % n] for n in column_numbers]
            translations = [header_map["Answer.translation%d" % n] for n in column_numbers]
            has_title = "Input.article" in header_map

            for line in csv_reader:
                if has_title:
                    title = line[header_map["Input.article"]] + " (translated)"
                else:
                    title = "Noname (translated)"
                approved = line[header_map["AssignmentStatus"]] == "Approved"
                worker_id_column = header_map["WorkerId"]
                submit_time_column = header_map["SubmitTime"]

                for i in range(10):
                    try:
                        (aid, seg_id) = line[segment_ids[i]].split("_")
                    except (IndexError, ValueError):
                        # treating this basically like an eof
                        break

                    if cur_aid != int(aid):
                        if sa:
                            # save the previous SourceArticle
                            sa.save(manually_splitting=True)
                        # Build the title from the per-row base title; the
                        # base itself stays untouched so a second article in
                        # the same row does not inherit the first one's id.
                        article_title = title if has_title else aid + " " + title

                        # check if the document is already imported
                        try:
                            sa = SourceArticle.objects.filter(language=source_lang).get(doc_id=aid)
                        except SourceArticle.DoesNotExist:
                            # make a new sa object
                            sa = SourceArticle()
                        sa.sentences_processed = True
                        cur_aid = int(aid)
                        sa.language = source_lang
                        sa.doc_id = aid
                        sa.timestamp = datetime.now()
                        sa.title = article_title
                        # get an id for the SourceArticle instance
                        sa.save(manually_splitting=True)

                        if ta:
                            # save the previous target article
                            ta.save()
                        # check if the target article has been translated and imported
                        try:
                            ta = TranslatedArticle.objects.filter(article=sa).get(language=target_lang)
                            # if there is one, do not touch unknown fields.
                        except TranslatedArticle.DoesNotExist:
                            # make a new TranslatedArticle object
                            ta = TranslatedArticle()
                            ta.article = sa
                        ta.title = article_title
                        ta.timestamp = datetime.now()
                        ta.language = target_lang
                        ta.approved = approved
                        ta.save()

                    # Headers are numbered 1..10 while i runs 0..9, hence the
                    # tag column of the current segment is number i + 1.
                    end_of_paragraph = True
                    tag_id = "Input.tag%d" % (i + 1)
                    if tag_id in header_map:
                        tag = line[header_map[tag_id]]
                        end_of_paragraph = re.search("LastSentence", tag) is not None

                    seg = line[segments[i]]
                    try:
                        ss = sa.sourcesentence_set.get(segment_id=seg_id)
                        created_sentence = False
                    except SourceSentence.DoesNotExist:
                        ss = SourceSentence()
                        ss.article = sa
                        created_sentence = True
                    ss.text = seg
                    ss.segment_id = seg_id
                    ss.end_of_paragraph = end_of_paragraph
                    ss.save()
                    if created_sentence:
                        sa.source_text += seg + u"\n"

                    translation = line[translations[i]]
                    try:
                        ts = ta.sentences.get(segment_id=seg_id)
                        created_translation = False
                    except TranslatedSentence.DoesNotExist:
                        ts = TranslatedSentence()
                        ts.segment_id = seg_id
                        created_translation = True
                    ts.source_sentence = ss
                    ts.text = translation
                    ts.translated_by = line[worker_id_column]
                    ts.language = target_lang
                    date_string = line[submit_time_column]
                    df = date_string.split(" ")
                    tf = df[3].split(":")
                    ts.translation_date = datetime(
                        int(df[5]),
                        month_names.index(df[1]) + 1,
                        int(df[2]),
                        int(tf[0]),
                        int(tf[1]),
                        int(tf[2]),
                    )
                    ts.approved = approved
                    ts.end_of_paragraph = ss.end_of_paragraph
                    ts.save()
                    if created_translation:
                        ta.sentences.add(ts)

        if sa:
            sa.save(manually_splitting=True)
        if ta:
            ta.save()
Пример #8
0
    def parse_source_file(self, source_file, article_id_map):
        """Import source sentences from a CSV file, one article at a time.

        The first row holds the headers.  Every following row carries the
        language in column 0 and up to ten segments located through the
        ``seg_id1`` .. ``seg_id10`` headers, each followed by its tag and
        its sentence text.  A segment id has the form
        ``<article id>_<segment id>``.

        A segment id of 0 hands the article collected so far to
        ``self.save_sentence`` and starts a fresh SourceArticle.  Every
        segment is stored as a new SourceSentence and echoed to stdout.

        article_id_map maps the article id string to the value passed as
        the last argument of ``self.save_sentence`` (presumably the title --
        confirm against that method).
        """
        # The headers are uniform in this file
        # lang,(seg_id1,tag1,seg1,img_url1,machine_translation1),...,(seg_idn,...)
        segments = ['seg_id%s' % i for i in xrange(1, 11)]
        sa = SourceArticle()
        # The context manager closes the file even when a row fails to parse.
        with open(source_file, 'r') as f:
            csv_reader = csv.reader(f)
            headers = next(csv_reader)
            header_map = dict((h, i) for i, h in enumerate(headers))
            segment_offsets = [header_map[seg] for seg in segments]

            for line in csv_reader:
                for offs in segment_offsets:
                    try:
                        (aid, seg_id) = line[offs].split('_')
                    except IndexError:
                        # treating this basically like an eof
                        try:
                            sa.save(manually_splitting=True)
                        except UnicodeDecodeError:
                            print('Argh! Unicode issues (1)...')
                            sa.delete()
                        break

                    if int(seg_id) == 0:
                        sa.sentences_processed = True
                        try:
                            self.save_sentence(sa, line[0], aid,
                                               article_id_map[aid])
                        except UnicodeDecodeError:
                            print('Argh! Unicode issues...(2)')
                            sa.delete()

                        # make a new sa object
                        sa = SourceArticle()
                    sa.save(manually_splitting=True)  # get an id
                    tag = line[offs + 1]
                    seg = line[offs + 2]
                    ss = SourceSentence()
                    ss.article = sa
                    ss.text = seg
                    ss.segment_id = seg_id
                    # The paragraph marker is looked up in the tag column,
                    # not in the sentence text.
                    ss.end_of_paragraph = (
                        re.search("LastSentence", tag) is not None)
                    ss.save()
                    print('%s :: %s :: %s' % (aid, seg_id, tag))
    def parse_source_file(self, source_file, article_id_map):
        """Read a source CSV file and store its articles and sentences.

        After the header row, each row holds the language in column 0 and
        up to ten segments found through the ``seg_id1`` .. ``seg_id10``
        headers.  The tag and the sentence text are read from the two
        columns that follow each segment id.  Segment ids look like
        ``<article id>_<segment id>``.

        When the article id differs from the previous segment's, the
        SourceArticle with that language and doc_id is fetched or created
        and its fields are refreshed.  Sentences are matched on segment_id
        within the article: existing ones are updated, missing ones are
        created and their text is appended to the article's source_text.

        article_id_map maps the article id string to the article title.
        """
        # The headers are uniform in this file
        # lang,(seg_id1,tag1,seg1,img_url1,machine_translation1),...,(seg_idn,...)
        segment_headers = ['seg_id%s' % i for i in xrange(1,11)]
        sa = None
        cur_aid = -1
        # Opening the file in a with-block guarantees that it is closed.
        with open(source_file, 'r') as f:
            csv_reader = unicode_csv_reader(f)
            headers = next(csv_reader)
            header_map = dict((h, i) for i, h in enumerate(headers))
            segment_offsets = [header_map[name] for name in segment_headers]

            for line in csv_reader:
                for offs in segment_offsets:
                    try:
                        (aid, seg_id) = line[offs].split('_')
                    except (IndexError, ValueError):
                        # treating this basically like an eof: the row is
                        # too short, or the cell is empty / not "<aid>_<seg>"
                        break

                    if cur_aid != int(aid):
                        if sa:
                            # save the previous SourceArticle
                            sa.save(manually_splitting=True)
                        language = line[0]
                        # check if the document is already imported
                        try:
                            sa = SourceArticle.objects.filter(language=language).get(doc_id=aid)
                        except SourceArticle.DoesNotExist:
                            # make a new sa object
                            sa = SourceArticle()
                        sa.sentences_processed = True
                        cur_aid = int(aid)
                        sa.language = language
                        sa.doc_id = aid
                        sa.timestamp = datetime.now()
                        sa.title = article_id_map[aid]
                        # get an id for the SourceArticle instance
                        sa.save(manually_splitting=True)

                    tag = line[offs + 1]
                    seg = line[offs + 2]

                    try:
                        ss = sa.sourcesentence_set.get(segment_id=seg_id)
                        created = False
                    except SourceSentence.DoesNotExist:
                        ss = SourceSentence()
                        ss.article = sa
                        created = True
                    ss.text = seg
                    ss.segment_id = seg_id
                    ss.end_of_paragraph = re.search("LastSentence", tag) is not None
                    ss.save()
                    if created:
                        sa.source_text += seg + u'\n'

        if sa:
            sa.save(manually_splitting=True)