# Check that every url listed in the -translations file is present in the
# given lett files; report any urls that could not be found.
import gzip

from lett import read_lett_iter

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('lettfiles', help='input lett files',
                        type=argparse.FileType('r'), nargs='+')
    parser.add_argument('-translations', help='url<TAB>translation pairs',
                        required=True, type=argparse.FileType('r'))
    parser.add_argument('-tlang', help='target language', default='fr')
    args = parser.parse_args()

    # Collect the urls (first column) from the translations file.
    translation_urls = set()
    fh = args.translations
    if fh.name.endswith('.gz'):
        # Re-open gzipped input in text mode; wrapping the text-mode handle
        # in GzipFile directly would fail under Python 3.
        fh = gzip.open(fh.name, 'rt')
    for line in fh:
        url = line.rstrip().split('\t', 1)[0]
        translation_urls.add(url)

    # Cross every url off the set as soon as it is seen in a lett file.
    for lett_file in args.lettfiles:
        for page in read_lett_iter(lett_file, decode=False):
            if page.url in translation_urls:
                translation_urls.remove(page.url)

    if translation_urls:
        print("Could not find %d target urls: %s"
              % (len(translation_urls), "\n".join(translation_urls)))
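# The `lett` module imported above is not part of this listing. Below is a
# minimal sketch of what `read_lett_iter` might look like, assuming the usual
# lett layout of one page per tab-separated line (language, mime type,
# encoding, url, base64 html, base64 text); the Page tuple and its field
# order are assumptions, not the canonical module.
import base64
from collections import namedtuple

Page = namedtuple('Page', 'url html text mime_type encoding lang')

def read_lett_iter(fh, decode=True):
    for line in fh:
        lang, mime, enc, url, html, text = line.rstrip('\n').split('\t')
        html = base64.b64decode(html)
        text = base64.b64decode(text)
        if decode:
            # Yield unicode strings; with decode=False the raw bytes are
            # passed through, which is enough for url-based checks.
            html = html.decode(enc, errors='replace')
            text = text.decode(enc, errors='replace')
        yield Page(url, html, text, mime, enc, lang)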
# Find pages with identical text within each lett file: urls whose text
# hashes to the same value are written to -multipairs as duplicate groups.
from collections import defaultdict

from lett import read_lett_iter

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('lettfiles', help='input lett files',
                        type=argparse.FileType('r'), nargs='+')
    parser.add_argument('-train', help='training url pairs',
                        required=True, type=argparse.FileType('r'))
    parser.add_argument('-multipairs', help='output file',
                        type=argparse.FileType('w'))
    parser.add_argument('-slang', help='source language', default='en')
    parser.add_argument('-tlang', help='target language', default='fr')
    args = parser.parse_args()

    train_pairs = read_url_pairs(args.train)
    source_urls = set(su for su, tu in train_pairs)
    target_urls = set(tu for su, tu in train_pairs)
    assert len(source_urls) == len(target_urls)

    for lett_file in args.lettfiles:
        # Group urls by the hash of their page text, per language.
        hash2url_s, hash2url_t = defaultdict(list), defaultdict(list)
        n_s, n_t = 0, 0
        for page in read_lett_iter(lett_file):
            h = hash(page.text)
            if page.lang == args.slang:
                hash2url_s[h].append(page.url)
                n_s += 1
            elif page.lang == args.tlang:
                hash2url_t[h].append(page.url)
                n_t += 1
        print("%s: %d %s and %d %s pages"
              % (lett_file.name, n_s, args.slang, n_t, args.tlang))

        if args.multipairs:
            # Any hash shared by more than one url marks a group of
            # duplicate pages.
            for h, urls in hash2url_s.items():
                if len(urls) > 1:
                    args.multipairs.write(
                        "%d\t%s\n" % (len(urls), "\t".join(urls)))
            for h, urls in hash2url_t.items():
                if len(urls) > 1:
                    args.multipairs.write(
                        "%d\t%s\n" % (len(urls), "\t".join(urls)))
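# Both the script above and the one below call read_url_pairs(), which is not
# shown in the source. A minimal sketch, assuming the training file holds one
# source_url<TAB>target_url pair per line:
def read_url_pairs(fh):
    pairs = []
    for line in fh:
        su, tu = line.rstrip('\n').split('\t', 1)
        pairs.append((su, tu))
    return pairs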
# Sanity-check training url pairs against lett files: every source url must
# occur on a page in the source language and every target url on a page in
# the target language; report the urls that could not be matched.
from lett import read_lett_iter

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('lettfiles', help='input lett files',
                        type=argparse.FileType('r'), nargs='+')
    parser.add_argument('-train', help='training url pairs',
                        required=True, type=argparse.FileType('r'))
    parser.add_argument('-slang', help='source language', default='en')
    parser.add_argument('-tlang', help='target language', default='fr')
    args = parser.parse_args()

    train_pairs = read_url_pairs(args.train)
    source_urls = set(su for su, tu in train_pairs)
    target_urls = set(tu for su, tu in train_pairs)
    assert len(source_urls) == len(target_urls)

    for lett_file in args.lettfiles:
        for page in read_lett_iter(lett_file):
            if page.lang == args.slang:
                assert page.url not in target_urls, \
                    "%s is in %s and cannot be a target url" % (
                        page.url, page.lang)
                if page.url in source_urls:
                    source_urls.remove(page.url)
            elif page.lang == args.tlang:
                assert page.url not in source_urls, \
                    "%s is in %s and cannot be a source url" % (
                        page.url, page.lang)
                if page.url in target_urls:
                    target_urls.remove(page.url)
            else:
                # ignore all other languages
                pass

    if source_urls or target_urls:
        print("Could not find %d source and %d target urls"
              % (len(source_urls), len(target_urls)))
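# Example invocations (script and data file names are hypothetical), assuming
# the three scripts above are saved as check_translation_urls.py,
# find_duplicate_texts.py and check_train_urls.py:
#
#   python check_translation_urls.py *.lett -translations translations.gz
#   python find_duplicate_texts.py *.lett -train train.pairs \
#       -multipairs dups.txt
#   python check_train_urls.py *.lett -train train.pairs -slang en -tlang fr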