def word2vec (conf):
    """ Build Chemotext2 word embeddings from the articles under conf.input_dir,
    writing the model beneath <output_dir>/w2v. """
    logger = LoggingUtil.init_logging (__file__)
    logger.info ("Creating Chemotext2 word embeddings from input: {0}".format (conf.input_dir))
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    article_paths = SUtil.get_article_paths (conf.input_dir) #[:20000]
    articles = sc.parallelize (article_paths, conf.spark_conf.parts). \
               map (lambda p : SUtil.get_article (p))
    logger.info ("Listed {0} input files".format (articles.count ()))
    # Normalize the output directory into a file:// URI ending in /w2v.
    conf.output_dir = conf.output_dir.replace ("file:", "")
    conf.output_dir = "file://{0}/w2v".format (conf.output_dir)
    return WordEmbed (sc, conf.output_dir, articles)
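# --- Hedged usage sketch (not part of the original module) ------------------
# Shows one way the word2vec() entry point above could be driven. It relies only
# on the attributes the function actually reads (input_dir, output_dir, and
# spark_conf, whose .parts field sets the RDD partition count); the
# SimpleNamespace config, the function name, and its arguments are illustrative
# assumptions, not the project's real configuration classes.
def word2vec_driver_sketch (input_dir, output_dir, spark_conf):
    from types import SimpleNamespace
    conf = SimpleNamespace (input_dir  = input_dir,
                            output_dir = output_dir,
                            spark_conf = spark_conf)
    # word2vec() rewrites conf.output_dir into a file:// URI ending in /w2v
    # before handing the articles RDD to WordEmbed.
    return word2vec (conf)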
def __iter__(self):
    """ Yield one tokenized sentence (a list of tokens) at a time from each
    matching article, with gene synonyms mapped to canonical form. """
    for file_name in self.files:
        if self.match (file_name):
            base = "{0}.json".format (os.path.basename (file_name))
            article_path = os.path.join (self.input_dir, base)
            article = SUtil.get_article (article_path)
            if article is not None:
                # http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize
                sentence_tokens = [ self.tokenizer.tokenize (s) for s in sent_tokenize (article.raw) ]
                sentences = [ self.gene_syn.make_canonical (s) for s in sentence_tokens ]
                for s in sentences:
                    yield s
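# --- Hedged usage sketch (not part of the original module) ------------------
# Illustrates how a restartable sentence iterator like the __iter__ above can be
# consumed. It assumes a gensim-style trainer; the actual pipeline may instead
# feed sentences to the Spark-based WordEmbed used in word2vec(). The argument
# names (sentences, out_path) and the hyperparameters are illustrative only.
def train_gensim_sketch (sentences, out_path):
    from gensim.models import Word2Vec  # assumes gensim is available
    # Because the corpus object defines __iter__, gensim can make the multiple
    # passes it needs: one to build the vocabulary, then one or more to train.
    model = Word2Vec (sentences, min_count = 3, workers = 4)
    model.save (out_path)
    return model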
def get_article (article_path):
    """ Load an article and map it to its equivalent set. Returns a
    single-element list, or an empty list if no article results, so that
    callers can compose it with flatMap. """
    article = EquivalentSet.get_article_equiv_set (SUtil.get_article (article_path))
    return [] if not article else [ article ]
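# --- Hedged usage sketch (not part of the original module) ------------------
# Shows why get_article() returns a list: the [] / [article] shape composes with
# Spark's flatMap, so paths that yield no article are silently dropped from the
# resulting RDD. The function name and parameters below are illustrative;
# `spark_context` is assumed to be an existing SparkContext.
def load_article_equiv_sets_sketch (spark_context, article_paths, num_partitions):
    return spark_context.parallelize (article_paths, num_partitions). \
           flatMap (lambda p : get_article (p))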