# ---- Example 1 ----
 def word2vec (conf):
     """Build Chemotext2 word embeddings from the articles under conf.input_dir.

     Parallelizes article loading over Spark, rewrites conf.output_dir to a
     file:// URL rooted at <output_dir>/w2v, and returns the WordEmbed model.
     """
     logger = LoggingUtil.init_logging (__file__)
     logger.info ("Creating Chemotext2 word embeddings from input: {0}".format (conf.input_dir))
     sc = SparkUtil.get_spark_context (conf.spark_conf)
     paths = SUtil.get_article_paths (conf.input_dir)
     articles = sc.parallelize (paths, conf.spark_conf.parts).map (SUtil.get_article)
     logger.info ("Listed {0} input files".format (articles.count ()))
     # Normalize the output location: strip any existing scheme prefix,
     # then point at the w2v subdirectory as an explicit file:// URL.
     stripped = conf.output_dir.replace ("file:", "")
     conf.output_dir = "file://{0}/w2v".format (stripped)
     return WordEmbed (sc, conf.output_dir, articles)
# ---- Example 2 ----
 def __iter__(self):
     """Yield one canonicalized token list per sentence across all matching files.

     For each file name accepted by self.match, loads the corresponding
     "<basename>.json" article from self.input_dir, sentence-splits its raw
     text, tokenizes each sentence, and yields the gene-synonym canonical form.
     Files whose article cannot be loaded are skipped.
     """
     for file_name in self.files:
         if not self.match (file_name):
             continue
         base = "{0}.json".format (os.path.basename (file_name))
         article = SUtil.get_article (os.path.join (self.input_dir, base))
         if article is None:
             continue
         # http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize
         for sentence in sent_tokenize (article.raw):
             yield self.gene_syn.make_canonical (self.tokenizer.tokenize (sentence))
# ---- Example 3 ----
 def get_article (article_path):
     """Load the article at article_path and reduce it to its equivalent set.

     Returns a single-element list holding the equivalent-set article, or an
     empty list when no article could be produced from the path.
     """
     equiv = EquivalentSet.get_article_equiv_set (SUtil.get_article (article_path))
     if not equiv:
         return []
     return [ equiv ]