def read_output_thanh():
    """Parse the debug output file and collect the top candidate per source URL.

    The file is a sequence of blocks of the form::

        ---------- <en_url>
        <score>: <candidate_url>        (first such line = top-ranked candidate)
        ---gold=<rank>,...              (rank -1 means the gold URL was missed)

    Returns:
        dict: mapping ``en_url`` -> top-ranked candidate URL.

    Side effect: prints, per domain, the number of pages whose gold URL was
    not ranked (``rank == -1``).

    Fixes over the earlier revision: uses the Py3 print function (the rest of
    the file already does), no longer prints a spurious line for the initial
    empty domain, and flushes the count of the final domain after the loop.
    """
    fname = './train_debugs/debug_full_05.txt'
    en_url = ''
    found_can = False
    cal_pairs = {}
    old_domain = None  # None until the first URL block is seen
    count = 0  # gold-rank == -1 misses for the current domain
    with open(fname, 'rt') as f:
        for line in f:
            ms = line.strip().split()
            if len(ms) != 2:
                # Only "<key> <value>" pairs carry information here.
                continue
            key, val = ms

            if key.startswith('----------'):
                # Start of a new source-URL block.
                en_url = val
                found_can = False
                domain = get_domain(en_url)
                if old_domain is not None and domain != old_domain:
                    # Domain changed: report the previous domain's miss count.
                    print(old_domain, '\t', count)
                    count = 0
                old_domain = domain
            elif not found_can and ':' in key:
                # First "<score>: <url>" line is the top-ranked candidate.
                found_can = True
                cal_pairs[en_url] = val
            elif key.startswith('---gold'):
                # "---gold=<rank>,..." — extract the integer rank.
                rank = int(key.replace('=', ',').split(',')[1])
                if rank == -1:
                    count += 1

    if old_domain is not None:
        # The loop only flushes on a domain change; report the last domain too.
        print(old_domain, '\t', count)
    return cal_pairs
def read_output_thanh():
    """Parse the debug output file and collect the top candidate per source URL.

    The file is a sequence of blocks of the form::

        ---------- <en_url>
        <score>: <candidate_url>        (first such line = top-ranked candidate)
        ---gold=<rank>,...              (rank -1 means the gold URL was missed)

    Returns a dict mapping en_url -> top-ranked candidate URL, and prints a
    per-domain count of pages whose gold URL was not ranked.
    """
    fname = './train_debugs/debug_full_05.txt'
    en_url = ''
    can_url = ''
    found_can = False  # True once the current block's top candidate was taken
    cal_pairs = {}  # en_url -> top candidate URL
    old_domain = ''  # NOTE(review): '' not None, so the guard below is always true
    count = 0  # gold-rank == -1 misses for the current domain
    with open(fname, 'rt') as f:
        for line in f:
            line = line.strip()
            ms = line.split()
            # Only "<key> <value>" pairs carry information here.
            if len(ms) != 2:
                continue
            key, val = ms

            if key.startswith('----------'):
                # Start of a new source-URL block: "---------- <en_url>".
                en_url = line.split(' ')[1]
                found_can = False
                domain = get_domain(en_url)
                # NOTE(review): with old_domain == '' initially, the first
                # domain change prints a spurious "'' <count>" line, and the
                # final domain's count is never printed after the loop.
                if old_domain is not None and domain != old_domain:
                    print old_domain, '\t', count
                    old_domain = domain
                    count = 0
            elif found_can == False and key.find(':') >= 0:
                # First "<score>: <url>" line is the top-ranked candidate.
                found_can = True
                can_url = val
                #print en_url + '\t' + can_url
                cal_pairs[en_url] = can_url
            elif key.startswith('---gold'):
                # "---gold=<rank>,..." — extract the integer rank.
                rank = int(key.replace('=', ',').split(',')[1])
                if rank == -1:
                    count += 1

    return cal_pairs
# Exemplo n.º 3
# 0
def run1():  #make clues for train set
    """Generate candidate clues for each (en_url, gold) pair in the train set.

    When the module-level ``debug`` flag is set, only URLs whose domain is in
    ``debug_domains`` are processed.
    """
    #debug_domains = ['www.dakar.com', 'www.luontoportti.com', 'www.nauticnews.com', 'www.the-great-adventure.fr']
    debug_domains = ['bugadacargnel.com', 'www.ec.gc.ca']
    with open(train_pairs, 'rt') as pairs_file:
        for raw_line in pairs_file:
            src_url, gold_url = raw_line.strip().split('\t')
            src_url = src_url.strip()
            gold_url = gold_url.strip()

            # In debug mode, restrict processing to the whitelisted domains.
            if debug and get_domain(src_url) not in debug_domains:
                continue

            get_candidates(src_url)
# Exemplo n.º 4
# 0
def run1():#make clues for train set
    """Generate candidate clues for each (en_url, gold) pair in the train set.

    When the module-level ``debug`` flag is set, only URLs whose domain is in
    ``debug_domains`` are processed.
    """
    #debug_domains = ['www.dakar.com', 'www.luontoportti.com', 'www.nauticnews.com', 'www.the-great-adventure.fr']
    debug_domains = ['bugadacargnel.com', 'www.ec.gc.ca']
    with open(train_pairs, 'rt') as f:
        for line in f:
            # Each train-pair line is "<en_url>\t<gold_url>".
            en_url, gold = line.strip().split('\t')
            en_url, gold = en_url.strip(), gold.strip()

            # In debug mode, restrict processing to the whitelisted domains.
            if debug and get_domain(en_url) not in debug_domains:
                continue

            get_candidates(en_url)
# Exemplo n.º 5
# 0
def get_candidates(en_url):
    '''Get all candidates for the given source English URL.

    NOTE(review): this revision is debug-truncated — it prints one line of
    corpus statistics and then returns early; everything after the first
    ``return`` is unreachable scoring code kept for reference.
    '''
    domain = get_domain(
        en_url
    )  #TODO: re-enable batching — process all en_urls of a domain per corpus load
    load_domain_corpus(domain)

    en_page = en_corpus[en_url]
    unique_en_tokens = list(set(en_page.tokens))
    score = get_chance_score(en_page.tokens, unique_en_tokens)
    # Columns: min_score, en_len, en_vocab_size, col_size, col_vocab_size, en_url
    print('%f\t%d\t%d\t%d\t%d\t%s' %
          (score, len(en_page.tokens), len(unique_en_tokens), col_size,
           col_vocab_size, en_url)
          )  #min_socre, #en_len,  #en_vocab_size, col_size, #col_vocab_size
    return

    # ---- unreachable below: full candidate scoring over the French corpus ----
    doc_col_scores = col_model_for_a_doc(en_page.tokens, unique_en_tokens)

    cans, scores = [], []

    #cans, scores = [''], [0]
    #max_score = float('-inf')

    #pq_result = []

    for fr_url in fr_corpus:
        #lrate = en_page.length/float(fr_corpus[fr_url].length)
        #if use_filter and (lrate < LENGTH_LOWER_BOUND or lrate > LENGTH_UPPER_BOUND):#filter length
        #continue
        #score = score_original(fr_url, en_page.tokens)
        #print('1:' + str(score))
        #score = score_original_optimal(fr_url, unique_en_tokens, doc_col_scores, max_score)
        score = score_original_optimal(fr_url, unique_en_tokens,
                                       doc_col_scores)
        #print('2:' + str(score))
        # None means the candidate was filtered out by the scorer.
        if score is not None:
            cans.append(fr_url)
            scores.append(score)

            #if score>max_score:
            #max_score = score
            #cans[0] = fr_url
            #scores[0] = score

            #if len(pq_result)<output_top:
            #heapq.heappush(pq_result, (score, fr_url))
            #else:
            #heapq.heappushpop(pq_result, (score, fr_url))

    cans, scores = sort_candidates(cans, scores)
    #cans, scores = np.array(cans), np.array(scores)
    return cans, scores
# Exemplo n.º 6
# 0
def load_translation(domain):
    '''Load translations for the given domain into the global tran_corpus.

    Lines belonging to other domains are skipped. Assumes all lines of one
    domain form a contiguous run in the file, so reading stops at the first
    foreign-domain line after the target domain has been seen.
    '''
    global tran_corpus
    tran_corpus = defaultdict(list)
    seen_target = False
    with open(tran_en, 'rt') as handle:
        for raw_line in handle:
            url, text = raw_line.strip().split('\t')
            url = url.strip()
            text = text.strip().lower()
            if url == 'unknown_url':
                continue
            if get_domain(url) == domain:
                # A single URL's translation may span multiple lines.
                tran_corpus[url].append(text)
                seen_target = True
            elif seen_target:
                # Past the domain's contiguous block — stop early.
                break
# Exemplo n.º 7
# 0
def run1():
    """Retrieve and evaluate candidates for every train pair, then print
    per-domain and overall summaries.

    When the module-level ``debug`` flag is set, only URLs whose domain is in
    ``debug_domains`` are processed.
    """
    #debug_domains = ['www.dakar.com', 'www.luontoportti.com', 'www.nauticnews.com', 'www.the-great-adventure.fr']
    debug_domains = ['eu.blizzard.com']
    with open(train_pairs, 'rt') as pairs_file:
        for raw_line in pairs_file:
            src_url, gold_url = raw_line.strip().split('\t')
            src_url = src_url.strip()
            gold_url = gold_url.strip()

            # In debug mode, restrict processing to the whitelisted domains.
            if debug and get_domain(src_url) not in debug_domains:
                continue

            candidate_urls, candidate_scores = get_candidates(src_url)
            count_evaluate(src_url, candidate_urls, candidate_scores, gold_url)

    print_domain_summary(current_domain)
    print_summary()
# Exemplo n.º 8
# 0
def load_translation(domain):
    '''Load translations for the given domain into the global tran_corpus.

    Lines belonging to other domains are skipped. Assumes all lines of one
    domain form a contiguous run in the file, so reading stops at the first
    foreign-domain line after the target domain has been seen.
    '''
    global tran_corpus
    tran_corpus = defaultdict(list)  # url -> list of translated lines
    domain_found = False  # True once the target domain's block was entered
    with open(tran_en, 'rt') as f:
        for line in f:
            # Each line is "<url>\t<translated text>".
            url, line = line.strip().split('\t')
            url, line = url.strip(), line.strip().lower()
            if url == 'unknown_url': continue
            line_domain = get_domain(url)
            if line_domain == domain:#a domain may have translations on multiple lines
                tran_corpus[url].append(line)
                domain_found = True
            elif domain_found:
                # Past the domain's contiguous block — stop early.
                break
# Exemplo n.º 9
# 0
def run1():
    """Retrieve and evaluate candidates for every train pair, then print
    per-domain and overall summaries.

    When the module-level ``debug`` flag is set, only URLs whose domain is in
    ``debug_domains`` are processed.
    """
    #debug_domains = ['www.dakar.com', 'www.luontoportti.com', 'www.nauticnews.com', 'www.the-great-adventure.fr']
    debug_domains = ['eu.blizzard.com']
    with open(train_pairs, 'rt') as f:
        for line in f:
            # Each train-pair line is "<en_url>\t<gold_url>".
            en_url, gold = line.strip().split('\t')
            en_url, gold = en_url.strip(), gold.strip()

            # In debug mode, restrict processing to the whitelisted domains.
            if debug and get_domain(en_url) not in debug_domains:
                continue

            cans, scores = get_candidates(en_url)
            count_evaluate(en_url, cans, scores, gold)

    print_domain_summary(current_domain)
    print_summary()
# Exemplo n.º 10
# 0
def get_candidates(en_url):
    '''Get all candidates for the given source English URL.

    NOTE(review): this revision is debug-truncated — it prints one line of
    corpus statistics and then returns early; everything after the first
    ``return`` is unreachable scoring code kept for reference.
    '''
    domain = get_domain(en_url)#TODO: re-enable batching — process all en_urls of a domain per corpus load 
    load_domain_corpus(domain)

    en_page = en_corpus[en_url]
    unique_en_tokens = list(set(en_page.tokens))
    score = get_chance_score(en_page.tokens, unique_en_tokens)  
    # Columns: min_score, en_len, en_vocab_size, col_size, col_vocab_size, en_url
    print('%f\t%d\t%d\t%d\t%d\t%s'%(score, len(en_page.tokens), len(unique_en_tokens), col_size, col_vocab_size, en_url))#min_socre, #en_len,  #en_vocab_size, col_size, #col_vocab_size
    return

    # ---- unreachable below: full candidate scoring over the French corpus ----
    doc_col_scores = col_model_for_a_doc(en_page.tokens, unique_en_tokens)

    cans, scores = [], []

    #cans, scores = [''], [0]
    #max_score = float('-inf')

    #pq_result = []
    
    for fr_url in fr_corpus:
        #lrate = en_page.length/float(fr_corpus[fr_url].length)
        #if use_filter and (lrate < LENGTH_LOWER_BOUND or lrate > LENGTH_UPPER_BOUND):#filter length
                #continue
        #score = score_original(fr_url, en_page.tokens)
        #print('1:' + str(score))
        #score = score_original_optimal(fr_url, unique_en_tokens, doc_col_scores, max_score)
        score = score_original_optimal(fr_url, unique_en_tokens, doc_col_scores)
        #print('2:' + str(score))
        # None means the candidate was filtered out by the scorer.
        if score is not None:
            cans.append(fr_url)
            scores.append(score)

            #if score>max_score:
                #max_score = score
                #cans[0] = fr_url
                #scores[0] = score

            #if len(pq_result)<output_top:
                #heapq.heappush(pq_result, (score, fr_url))
            #else:
                #heapq.heappushpop(pq_result, (score, fr_url))
                
    cans, scores = sort_candidates(cans, scores)
    #cans, scores = np.array(cans), np.array(scores)
    return cans, scores
# Exemplo n.º 11
# 0
def get_candidates(en_url):
    '''Get all candidates for the given source English URL.

    NOTE(review): this revision is a debugging variant — it computes and
    prints similarity measures (KL divergence, cross entropy, cosine) between
    the English page and its paired translated French page from ``cal_pairs``,
    then returns. Everything after the first ``return`` (including the
    ``pdb.set_trace()``) is unreachable code kept for reference.
    '''
    domain = get_domain(en_url)#TODO: re-enable batching — process all en_urls of a domain per corpus load 
    load_domain_corpus(domain)

    en_page = en_corpus[en_url]
    en_text = ' '.join(en_page.tokens)
    # Only pages that have a paired candidate can be compared.
    if en_url not in cal_pairs:
        return 
    fr_url = cal_pairs[en_url]
    fr_text = ' '.join(tran_corpus[fr_url])
    if len(fr_text)==0 or len(en_text)==0:
        # Sentinel value 20 marks pairs where either side has no text.
        print('%s\t%s\t%f\t%f\t%f'%(en_url, fr_url, 20, 20, 20))
        return
    #print('debug len:', len(fr_text), 'en_len', len(en_text), sep='\t')

    # Build probability models over the two token streams.
    en_model, en_len = get_pro_model(' '.join(en_page.tokens))
    fr_model, fr_len = get_pro_model(' '.join(tran_corpus[cal_pairs[en_url]]))

    kl = get_kl_divergence(en_model, en_len, fr_model, fr_len)
    cr = get_cross_entropy(en_model, en_len, fr_model, fr_len)
    cosine = get_cosine(en_model, en_len, fr_model, fr_len)
    # Columns: en_url, fr_url, KL divergence, cross entropy, cosine similarity.
    print('%s\t%s\t%f\t%f\t%f'%(en_url, fr_url, kl, cr, cosine))

    return
    # ---- unreachable below: leftover debug hook and disabled scoring code ----
    import pdb
    pdb.set_trace()

    #clues 1
    en_page = en_corpus[en_url]
    unique_en_tokens = list(set(en_page.tokens))
    score = get_chance_score(en_page.tokens, unique_en_tokens)  
    print('%f\t%d\t%d\t%d\t%d\t%s'%(score, len(en_page.tokens), len(unique_en_tokens), col_size, col_vocab_size, en_url))#min_socre, #en_len,  #en_vocab_size, col_size, #col_vocab_size
    return

    doc_col_scores = col_model_for_a_doc(en_page.tokens, unique_en_tokens)

    cans, scores = [], []

    #cans, scores = [''], [0]
    #max_score = float('-inf')

    #pq_result = []
    
    for fr_url in fr_corpus:
        #lrate = en_page.length/float(fr_corpus[fr_url].length)
        #if use_filter and (lrate < LENGTH_LOWER_BOUND or lrate > LENGTH_UPPER_BOUND):#filter length
                #continue
        #score = score_original(fr_url, en_page.tokens)
        #print('1:' + str(score))
        #score = score_original_optimal(fr_url, unique_en_tokens, doc_col_scores, max_score)
        score = score_original_optimal(fr_url, unique_en_tokens, doc_col_scores)
        #print('2:' + str(score))
        if score is not None:
            cans.append(fr_url)
            scores.append(score)

            #if score>max_score:
                #max_score = score
                #cans[0] = fr_url
                #scores[0] = score

            #if len(pq_result)<output_top:
                #heapq.heappush(pq_result, (score, fr_url))
            #else:
                #heapq.heappushpop(pq_result, (score, fr_url))
                
    cans, scores = sort_candidates(cans, scores)
    #cans, scores = np.array(cans), np.array(scores)
    return cans, scores