Example #1
import gzip

from lett import read_lett_iter


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('lettfiles', help='input lett files',
                        type=argparse.FileType('r'), nargs='+')
    parser.add_argument('-translations',
                        help='url<TAB>translation pairs', required=True,
                        type=argparse.FileType('r'))
    parser.add_argument('-tlang', help='target language', default='fr')
    args = parser.parse_args()

    # Collect every url that has a translation; any url still in this set
    # after scanning the lett files was never found in the crawl.
    translation_urls = set()
    fh = args.translations
    if fh.name.endswith('.gz'):
        fh = gzip.GzipFile(fileobj=fh, mode='r')
    for line in fh:
        url = line.rstrip().split('\t', 1)[0]
        translation_urls.add(url)

    for lett_file in args.lettfiles:
        for page in read_lett_iter(lett_file, decode=False):
            if page.url in translation_urls:
                translation_urls.remove(page.url)

    if translation_urls:
        print "Could not find %d target urls: %s" \
            % (len(translation_urls), "\n".join(translation_urls))
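For context, read_lett_iter comes from the lett module and is not shown on this page. Below is a minimal sketch of what it might look like, assuming the usual .lett layout of one page per line with six tab-separated fields (language, MIME type, encoding, url, base64 html, base64 text); the field order and the Page attributes are assumptions inferred from how the examples use them, not the module's confirmed API.

import base64
import gzip
from collections import namedtuple

# Assumed page record; attribute names mirror the usage in the examples
# (page.url, page.text, page.lang).
Page = namedtuple("Page", "url html text mime_type encoding lang")


def read_lett_iter(f, decode=True):
    # Sketch only: assumes one page per line with six tab-separated
    # fields: lang, mime type, encoding, url, base64 html, base64 text.
    fh = f
    if f.name.endswith('.gz'):
        fh = gzip.GzipFile(fileobj=fh, mode='r')
    for line in fh:
        lang, mime, enc, url, html, text = line.rstrip('\n').split('\t')
        html, text = base64.b64decode(html), base64.b64decode(text)
        if decode:
            html, text = html.decode('utf-8'), text.decode('utf-8')
        yield Page(url, html, text, mime, enc, lang)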
Example #2
from collections import defaultdict

from lett import read_lett_iter
# read_url_pairs (used below) is defined elsewhere in the original script;
# a sketch is given after Example #3.


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('lettfiles', help='input lett files',
                        type=argparse.FileType('r'), nargs='+')
    parser.add_argument('-train', help='training url pairs', required=True,
                        type=argparse.FileType('r'))
    parser.add_argument('-multipairs', help='output file',
                        type=argparse.FileType('w'))
    parser.add_argument('-slang', help='source language', default='en')
    parser.add_argument('-tlang', help='target language', default='fr')
    args = parser.parse_args()

    train_pairs = read_url_pairs(args.train)

    source_urls = set(su for su, tu in train_pairs)
    target_urls = set(tu for su, tu in train_pairs)
    assert len(source_urls) == len(target_urls)

    for lett_file in args.lettfiles:
        hash2url_s, hash2url_t = defaultdict(list), defaultdict(list)
        n_s, n_t = 0, 0
        for page in read_lett_iter(lett_file):
            h = hash(page.text)

            if page.lang == args.slang:
                hash2url_s[h].append(page.url)
                n_s += 1
            elif page.lang == args.tlang:
                hash2url_t[h].append(page.url)
                n_t += 1

        # Write groups of urls whose extracted text is identical to the
        # '-multipairs' output file.
        if args.multipairs:
            for h, urls in hash2url_s.iteritems():
                if len(urls) > 1:
                    args.multipairs.write(
                        "%d\t%s\n" % (len(urls), "\t".join(urls)))

            for h, urls in hash2url_t.iteritems():
                if len(urls) > 1:
                    args.multipairs.write(
                        "%d\t%s\n" % (len(urls), "\t".join(urls)))
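One design note on the hashing above: Python's built-in hash() groups exact text duplicates within a single run, but its values are not stable across interpreter runs. If the duplicate groups need to be reproducible, a content digest is a drop-in replacement; a sketch, assuming Python 2 as in the examples:

import hashlib


def text_hash(text):
    # md5 of the UTF-8 bytes; unlike hash(), stable across runs.
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    return hashlib.md5(text).hexdigest()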
Example #3
from lett import read_lett_iter
# read_url_pairs is again assumed to be defined elsewhere in the script.


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('lettfiles', help='input lett files',
                        type=argparse.FileType('r'), nargs='+')
    parser.add_argument('-train', help='training url pairs', required=True,
                        type=argparse.FileType('r'))
    parser.add_argument('-slang', help='source language', default='en')
    parser.add_argument('-tlang', help='target language', default='fr')
    args = parser.parse_args()

    train_pairs = read_url_pairs(args.train)

    source_urls = set(su for su, tu in train_pairs)
    target_urls = set(tu for su, tu in train_pairs)
    assert len(source_urls) == len(target_urls)

    # Scan the lett files, removing train urls as they are found; a train
    # url that shows up on the wrong language side indicates a bad pair.
    for lett_file in args.lettfiles:
        for page in read_lett_iter(lett_file):
            if page.lang == args.slang:
                assert page.url not in target_urls, \
                    "%s is in %s and cannot be a target url" % (
                        page.url, page.lang)
                if page.url in source_urls:
                    source_urls.remove(page.url)
            elif page.lang == args.tlang:
                assert page.url not in source_urls, \
                    "%s is in %s and cannot be a source url" % (
                        page.url, page.lang)
                if page.url in target_urls:
                    target_urls.remove(page.url)
            else:  # ignore all other languages
                pass
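Examples #2 and #3 both call read_url_pairs, which the scraped page does not include. A minimal sketch, assuming the -train file holds one source_url<TAB>target_url pair per line:

def read_url_pairs(fh):
    # One "source_url<TAB>target_url" pair per line.
    pairs = []
    for line in fh:
        su, tu = line.rstrip('\n').split('\t', 1)
        pairs.append((su, tu))
    return pairs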