from content_processor import ContentProcessor  # assumed project-local module


class TranslatorInterface:
    """An interface to a single, possibly multilingual, model."""

    def __init__(self, srclang, targetlang, service, model):
        self.service = service
        self.contentprocessor = ContentProcessor(
            srclang, targetlang,
            sourcebpe=self.service.get('sourcebpe'),
            targetbpe=self.service.get('targetbpe'),
            sourcespm=self.service.get('sourcespm'),
            targetspm=self.service.get('targetspm'))
        self.worker = model
        # becomes nonempty if there are multiple target languages
        self.preamble = ""

    def translate(self, text):
        sentences = self.contentprocessor.preprocess(text)
        translatedSentences = self.worker.translate(
            self.preamble + '\n'.join(sentences))
        translation = self.contentprocessor.postprocess(translatedSentences)
        return ' '.join(translation)

    def ready(self):
        return self.worker is not None and self.worker.ready()

    def on_exit(self):
        if self.worker is not None:
            self.worker.on_exit()
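# Hedged usage sketch (not from the original source). All values below are
# hypothetical placeholders; the key names are the ones the constructors of
# TranslatorInterface and TranslatorWorker (defined below) actually read.
# With an empty 'configuration', the worker does not spawn marian-server
# itself and simply connects to one already listening on host/port.
service = {
    'host': 'localhost',
    'port': '8080',
    'configuration': '',        # non-empty would spawn a local marian-server
    'sourcespm': 'source.spm',  # hypothetical SentencePiece model paths
    'targetspm': 'target.spm',
}
worker = TranslatorWorker('en', 'fi', service)
translator = TranslatorInterface('en', 'fi', service, model=worker)
print(translator.translate("Hello world. How are you?"))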
import websocket
from tornado import gen, process, queues

from content_processor import ContentProcessor  # assumed project-local module


class TranslatorWorker:
    def __init__(self, srclang, targetlang, service):
        self.q = queues.Queue()
        # Service definition
        self.service = service
        self.p = None
        self.contentprocessor = ContentProcessor(
            srclang, targetlang,
            sourcebpe=self.service.get('sourcebpe'),
            targetbpe=self.service.get('targetbpe'),
            sourcespm=self.service.get('sourcespm'),
            targetspm=self.service.get('targetspm'))
        self.ws_url = "ws://{}:{}/translate".format(self.service['host'],
                                                    self.service['port'])
        if self.service['configuration']:
            self.run()

    @gen.coroutine
    def run(self):
        process.Subprocess.initialize()
        self.p = process.Subprocess([
            'marian-server',
            '-c', self.service['configuration'],
            '-p', self.service['port'],
            '--allow-unk',
            # beam size of 6 with a mini-batch of 64, i.e. 64 sentences
            # are translated at once
            '-b', '6',
            '--mini-batch', '64',
            # length-normalization weight of 0.6 (usually increases BLEU a bit)
            '--normalize', '0.6',
            '--maxi-batch-sort', 'src',
            '--maxi-batch', '100',
        ])
        self.p.set_exit_callback(self.on_exit)
        ret = yield self.p.wait_for_exit()

    def on_exit(self):
        print("Process exited")

    def translate(self, srctxt):
        ws = websocket.create_connection(self.ws_url)
        sentences = self.contentprocessor.preprocess(srctxt)
        ws.send('\n'.join(sentences))
        translatedSentences = ws.recv().split('\n')
        ws.close()
        translation = self.contentprocessor.postprocess(translatedSentences)
        return ' '.join(translation)
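# For reference, a service.json entry of the shape consumed above and looked
# up as services[src][trg] in the scoring script further down; all values
# here are hypothetical placeholders.
#
# {
#   "en": {
#     "fi": {
#       "host": "localhost",
#       "port": "8080",
#       "configuration": "models/en-fi/decoder.yml",
#       "sourcespm": "models/en-fi/source.spm",
#       "targetspm": "models/en-fi/target.spm"
#     }
#   }
# }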
from scrapy import Request, Spider, signals
from scrapy.exceptions import DontCloseSpider
from scrapy.http import HtmlResponse

from content_processor import ContentProcessor  # assumed project-local module


class GeneralSpider(Spider):
    name = 'general'

    def __init__(self, *args, **kwargs):
        super(GeneralSpider, self).__init__(*args, **kwargs)
        self.content_processor = ContentProcessor()

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        spider.crawler.signals.connect(spider.spider_idle,
                                       signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        self.log("Spider idle signal caught.")
        raise DontCloseSpider

    def parse(self, response):
        # only HTML pages are processed; other responses are ignored
        if not isinstance(response, HtmlResponse):
            return
        pc = self.content_processor.process_response(response)
        for link in pc.links:
            r = Request(url=link.url)
            r.meta.update(link_text=link.text)
            yield r
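# Hedged sketch (not in the original source) of driving GeneralSpider through
# Scrapy's standard CrawlerProcess API; the start URL is a placeholder. Note
# that spider_idle raises DontCloseSpider, so the crawl keeps running until
# it is stopped externally (e.g. Ctrl-C).
from scrapy.crawler import CrawlerProcess

crawler_process = CrawlerProcess()
crawler_process.crawl(GeneralSpider, start_urls=['https://example.com/'])
crawler_process.start()  # blocks for the lifetime of the crawl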
import websocket
from tornado import gen, process, queues

from content_processor import ContentProcessor  # assumed project-local module


class TranslatorWorker:
    def __init__(self, srclang, targetlang, service):
        self.q = queues.Queue()
        # Service definition
        self.service = service
        self.p = None
        self.contentprocessor = ContentProcessor(
            srclang, targetlang,
            sourcebpe=self.service.get('sourcebpe'),
            targetbpe=self.service.get('targetbpe'),
            sourcespm=self.service.get('sourcespm'),
            targetspm=self.service.get('targetspm')
        )
        self.ws_url = "ws://{}:{}/translate".format(
            self.service['host'], self.service['port'])
        if self.service['configuration']:
            self.run()

    @gen.coroutine
    def run(self):
        process.Subprocess.initialize()
        self.p = process.Subprocess(['marian-server',
                                     '-c', self.service['configuration'],
                                     '--quiet-translation',
                                     '-p', self.service['port']])
        self.p.set_exit_callback(self.on_exit)
        ret = yield self.p.wait_for_exit()

    def on_exit(self):
        print("Process exited")

    def translate(self, srctxt):
        ws = websocket.create_connection(self.ws_url)
        sentences = self.contentprocessor.preprocess(srctxt)
        # sentences are sent one at a time rather than as a single
        # newline-joined batch
        translatedSentences = []
        for sentence in sentences:
            ws.send(sentence)
            translatedSentences.append(ws.recv())
        ws.close()
        translation = self.contentprocessor.postprocess(translatedSentences)
        return ' '.join(translation)
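# This variant trades throughput for simplicity: one WebSocket round trip per
# sentence instead of the single newline-joined batch used by the first
# worker. Hedged usage sketch, reusing the hypothetical service dict from the
# sketch near the top:
worker = TranslatorWorker('en', 'fi', service)
print(worker.translate("Hello world. How are you?"))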
import csv
import json

from content_processor import ContentProcessor  # assumed project-local module

x = x.strip()
y = y.strip()
z = z.strip()
pairs.append((x, y, z))

# Filter out sentences with fewer than 5 tokens or more than 120 tokens
for i in range(len(pairs) - 1, -1, -1):
    if len(pairs[i][0].split()) > 120 or len(pairs[i][0].split()) <= 4:
        pairs.pop(i)

# Load preprocessor
services = {}
with open("service.json", 'r') as configfile:
    services = json.load(configfile)
config = services[src][trg]
contentprocessor = ContentProcessor(
    src, trg,
    sourcebpe=config.get('sourcebpe'),
    targetbpe=config.get('targetbpe'),
    sourcespm=config.get('sourcespm'),
    targetspm=config.get('targetspm'))

with open("{}_en_pairs.csv".format(src), "a", newline='') as datacsv:
    csvwriter = csv.writer(datacsv, dialect="excel")
    csvwriter.writerow(["score", src, "en"])
    for s, t, score in pairs:
        csvwriter.writerow([score, s, t])

sentences = [contentprocessor.preprocess(pair[0]) for pair in pairs]
with open('input_{}.txt'.format(src), 'w') as f:
    for _list in sentences:
        for _string in _list:
            f.write(_string + ' ')
        f.write('\n')
from query import CrawlerDb
from content_processor import ContentProcessor
from settings import LOGGING
import sys, urlparse, urllib2, shutil, glob, robotparser
import logging, logging.config
import traceback

# ===== Init stuff =====

# db init
cdb = CrawlerDb()
cdb.connect()

# content processor init
processor = ContentProcessor(None, None, None)

# logging setup
logging.config.dictConfig(LOGGING)
logger = logging.getLogger("crawler_logger")

# robot parser init
robot = robotparser.RobotFileParser()

if len(sys.argv) < 2:
    logger.error("Error: No start url was passed")
    sys.exit()

l = sys.argv[1:]
cdb.enqueue(l)
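# Hedged sketch (Python 2, matching the imports above) of how the robot
# parser initialized above is typically consulted before fetching a page;
# the crawl loop that follows in the full script is omitted here.
url = l[0]
robot.set_url(urlparse.urljoin(url, '/robots.txt'))
robot.read()
if robot.can_fetch("*", url):
    page = urllib2.urlopen(url).read()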