示例#1
0
    def augment (conf):
        """
        Augment the annotated binaries and write them out as one CSV.

        The original note on this function describes it as the place to extend
        the CSV data with:
        * Word2Vec derived cosine similarity
        * Second derivative of the frequency of mentions
        The only augmentation applied in this body is freq_derivative, run over
        the binaries that share a simplekey.

        Side effects: conf.output_dir is rewritten as a file:// URI, and any
        existing <output_dir>/eval/augment.csv is removed before saving.
        """
        spark = SparkUtil.get_spark_context (conf.spark_conf)

        # Strip any existing scheme, then re-apply it exactly once.
        conf.output_dir = "file://{0}".format (conf.output_dir.replace ("file:", ""))

        # Gather the binaries sharing a key into a single list per key.
        keyed = Evaluate.load_all (spark, conf).map (lambda b : ( simplekey(b), [ b ] ))
        grouped = keyed.reduceByKey (lambda x, y : x + y)

        # Run the derivative over each group, then flatten back to single records.
        derived = grouped.mapValues (lambda b : freq_derivative (b))
        rows = derived.flatMap (lambda kv : kv[1]).coalesce (1)

        output_file = os.path.join (conf.output_dir, "eval", "augment.csv")
        local_path = output_file.replace ("file://", "")
        if os.path.exists (local_path):
            print ("removing existing output file")
            shutil.rmtree (local_path)

        rows.map (to_csv_row).saveAsTextFile (output_file)
示例#2
0
def main ():
    """
    Annotate model files with word embedding computed cosine similarity.

    Parses the command line, builds an EvaluateConf from it, creates a Spark
    context and passes the input directory and partition count to
    process_graphs.

    --input, --output, --slices and --parts are mandatory because each of them
    is dereferenced unconditionally below; leaving one out makes argparse print
    a usage error and exit instead of failing later with a TypeError or
    AttributeError on None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host",   help="Mesos master host")
    parser.add_argument("--name",   help="Spark framework name")
    parser.add_argument("--input",  required=True,
                        help="Output directory for a Chemotext2 run.")
    parser.add_argument("--output", required=True,
                        help="Output directory for evaluation.")
    parser.add_argument("--slices", required=True, type=int,
                        help="Number of separate work chunks.")
    parser.add_argument("--parts",  required=True, type=int,
                        help="Number of partitions for the computation.")
    parser.add_argument("--venv",   help="Path to Python virtual environment to use")
    args = parser.parse_args()

    conf = EvaluateConf (
        spark_conf = SparkConf (host           = args.host,
                                venv           = args.venv,
                                framework_name = args.name,
                                parts          = args.parts),
        # Directories are kept as plain paths; any file:// scheme is dropped.
        input_dir      = args.input.replace ("file://", ""),
        output_dir     = args.output.replace ("file://", ""),
        slices         = args.slices)

    print ("Data home: {0}".format (args.input))

    sc = SparkUtil.get_spark_context (conf.spark_conf)
    process_graphs (sc, conf.input_dir, conf.spark_conf.parts)
示例#3
0
def main ():
    """
    Tools for running word2vec on the corpus.

    Parses the command line, builds an EvaluateConf from it, and calls
    build_all_models with the input directory, a file list, the model
    directory (<parent of input>/w2v/gensim) and the HGNC synonym CSV found
    under the output directory.

    --input, --output, --slices and --parts are mandatory because each of them
    is dereferenced unconditionally below; leaving one out makes argparse print
    a usage error and exit instead of failing later with a TypeError or
    AttributeError on None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host",   help="Mesos master host")
    parser.add_argument("--name",   help="Spark framework name")
    parser.add_argument("--input",  required=True,
                        help="Output directory for a Chemotext2 run.")
    parser.add_argument("--output", required=True,
                        help="Output directory for evaluation.")
    parser.add_argument("--slices", required=True, type=int,
                        help="Number of slices of files to iterate over.")
    parser.add_argument("--parts",  required=True, type=int,
                        help="Number of partitions for the computation.")
    parser.add_argument("--venv",   help="Path to Python virtual environment to use")

    args = parser.parse_args()
    conf = EvaluateConf (
        spark_conf = SparkConf (host           = args.host,
                                venv           = args.venv,
                                framework_name = args.name,
                                parts          = args.parts),
        # Directories are kept as plain paths; any file:// scheme is dropped.
        input_dir      = args.input.replace ("file://", ""),
        output_dir     = args.output.replace ("file://", ""),
        slices         = args.slices)

    root = os.path.dirname (conf.input_dir)
    model_dir = os.path.join (root, "w2v", "gensim")
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    # NOTE(review): site-specific absolute path; not configurable from the command line.
    file_list = "/projects/stars/app/chemotext/filelist.json"

    hgnc = os.path.join (conf.output_dir, "HGNC", "HGNCGeneSynonyms.csv")
    print ("hgnc: {0}".format (hgnc))

    build_all_models (sc, conf.input_dir, file_list, model_dir, hgnc)
示例#4
0
 def word2vec (conf):
     """
     Build a WordEmbed over every article found under conf.input_dir.

     Side effect: conf.output_dir is rewritten to the file:// URI of its
     "w2v" sub-directory, and that URI is what WordEmbed receives.
     """
     log = LoggingUtil.init_logging (__file__)
     log.info ("Creating Chemotext2 word embeddings from input: {0}".format (conf.input_dir))

     spark = SparkUtil.get_spark_context (conf.spark_conf)
     paths = SUtil.get_article_paths (conf.input_dir)
     path_rdd = spark.parallelize (paths, conf.spark_conf.parts)
     corpus = path_rdd.map (lambda p : SUtil.get_article (p))
     log.info ("Listed {0} input files".format (corpus.count ()))

     # Strip any existing scheme, then point the output at <dir>/w2v as a file:// URI.
     bare_dir = conf.output_dir.replace ("file:", "")
     conf.output_dir = "file://{0}/w2v".format (bare_dir)
     return WordEmbed (spark, conf.output_dir, corpus)
示例#5
0
def analyze_medline (conf):
    """
    Load the pmid -> date JSON map, broadcast it, and print the entries for
    the keys "1", "2" and "3" as looked up through a small Spark job.

    conf is logged and passed straight to SparkUtil.get_spark_context.
    Raises KeyError (inside the Spark job) if any of those keys is missing
    from the map.
    """
    logger.info ("conf: {0}".format (conf))
    sc = SparkUtil.get_spark_context (conf)

    start = time.time()
    # NOTE(review): the SQLContext object itself is never used here; it is kept
    # in case constructing it is relied on for its side effects -- confirm.
    sqlContext = SQLContext (sc)
    with open ("/projects/stars/var/chemotext/pmid/pmid_date_2.json", "r") as stream:
        p = json.load (stream)
    elapsed = time.time() - start
    print ("TIME(load): ------------> {0}".format (elapsed))

    broadcastPMID = sc.broadcast (p)

    spots = sc.parallelize ([ 1, 2, 3 ])
    # JSON object keys are strings, hence the str() on the lookup.
    times = spots.map (lambda s : broadcastPMID.value [str(s)])
    # Parenthesised form prints the same on Python 2 and is valid on Python 3.
    print (times.collect ())
示例#6
0
def execute (conf, home):
    """
    Run the literature crawl: load data from the DataLake, log every result
    of LitCrawl.find_before, then print word-embedding synonyms of each
    vocabulary.A term that contain "kinase" or "p53".

    conf -- supplies spark_conf and data_lake_conf; also passed whole to WordEmbed.
    home -- not used in this body; kept so existing callers keep working.
    """
    sc = SparkUtil.get_spark_context (conf.spark_conf)

    data_lake = DataLake (sc, conf.data_lake_conf)
    kin2prot = data_lake.get_kin2prot ()
    articles = data_lake.load_articles ()
    vocabulary = data_lake.load_vocabulary (kin2prot)
    pmid_date = data_lake.load_pmid_date () # ( pmid -> date )

    binaries = LitCrawl.find_interactions (sc, vocabulary, articles)
    facts = LitCrawl.find_facts (vocabulary, binaries)
    before = LitCrawl.find_before (pmid_date, facts)

    for m in before.collect ():
        logger.info ("Before-Ref-Date:> {0}".format (m))

    embed = WordEmbed (sc, conf, articles)
    for w in vocabulary.A.collect ():
        for syn in embed.find_syn (w, radius=800):
            if "kinase" in syn or "p53" in syn:
                # Parenthesised form prints the same on Python 2 and is valid on Python 3.
                print ("   --[ {0} ]:syn>> {1}".format (w, syn))
示例#7
0
    def plot (conf):
        """
        Load the annotated binaries and hand them to the Evaluate plotting
        routines: plot_before gets the binaries grouped by simplekey, and
        plot_distances gets (fact, docDist, paraDist, sentDist) tuples.

        Side effect: conf.output_dir is rewritten to the file:// URI of its
        "eval" sub-directory; plots go to the same location as a plain path.
        """
        sc = SparkUtil.get_spark_context (conf.spark_conf)
        sqlContext = SQLContext(sc) # object unused but defines toDF()
        print ("Original: Output dir: {0}".format (conf.output_dir))
        conf.output_dir = conf.output_dir.replace ("file:", "")
        conf.output_dir = "file://{0}/eval".format (conf.output_dir)

        print ("Output dir: {0}".format (conf.output_dir))

        annotated = Evaluate.load_all (sc, conf) #.sample (False, 0.02)
        # A list comprehension (rather than filter) keeps each group a real list
        # on Python 3 as well, where filter() returns a one-shot lazy iterator.
        before = annotated. \
                 map         (lambda b : ( simplekey(b), [ b ] ) ). \
                 reduceByKey (lambda x,y : x + y). \
                 mapValues   (lambda x : [ v for v in x if v is not None ])

        print ("Got {0} before values".format (before.count ()))

        plot_path = conf.output_dir.replace ("file://", "")
        print ("Generating plots to plot path: {0}".format (plot_path))
        Evaluate.plot_before (before, plot_path)

        distances = annotated.map (lambda x : ( x.fact, x.docDist, x.paraDist, x.sentDist) )
        Evaluate.plot_distances (distances)
示例#8
0
    def train_model (conf):
        """
        Train a logistic regression model on the annotated binaries, save it
        under <output_dir>/eval/model, reload it, and print evaluation metrics.

        Each binary becomes a LabeledPoint: label 1.0 when b.fact is truthy,
        otherwise 0.0, with features [paraDist, sentDist, docDist]. The data
        is split 80/20 into train/test with a fixed seed.

        Side effects: conf.output_dir is rewritten as a file:// URI, and an
        existing model directory is removed before saving.
        """
        sc = SparkUtil.get_spark_context (conf.spark_conf)
        conf.output_dir = conf.output_dir.replace ("file:", "")
        conf.output_dir = "file://{0}".format (conf.output_dir)

        labeled = Evaluate.load_all (sc, conf). \
                  map (lambda b : LabeledPoint ( label = 1.0 if b.fact else 0.0,
                                                 features = [ b.paraDist, b.sentDist, b.docDist ] ) )

        train, test = labeled.randomSplit (weights=[ 0.8, 0.2 ], seed=12345)

        count = train.count ()
        start = time.time ()
        model = LogisticRegressionWithLBFGS.train (train)
        elapsed = time.time () - start
        print ("Trained model on training set of size {0} in {1} seconds".format (count, elapsed))

        start = time.time ()
        model_path = os.path.join (conf.output_dir, "eval", "model")
        file_path = model_path.replace ("file://", "")
        if os.path.isdir (file_path):
            print ("Removing existing model {0}".format (file_path))
            shutil.rmtree (file_path)
        model.save(sc, model_path)
        sameModel = LogisticRegressionModel.load(sc, model_path)
        elapsed = time.time () - start
        print ("Saved and restored model to {0} in {1} seconds".format (model_path, elapsed))

        # Training error: mismatches on the training split over the training
        # size. Numerator and denominator now come from the same split.
        trainLabelsAndPreds = train.map (lambda p: (p.label, model.predict (p.features)))
        trainErr = trainLabelsAndPreds.filter (lambda vp: vp[0] != vp[1]).count () / float (count)
        print("Training Error => {0}".format (trainErr))

        # Metrics on the held-out test split.
        labelsAndPreds = test.map (lambda p: (p.label, model.predict (p.features)))
        predictionsAndLabels = labelsAndPreds.map (lambda x : ( float(x[1]), float(x[0]) ))
        metrics = MulticlassMetrics (predictionsAndLabels)
        print (" --------------> {0}".format (predictionsAndLabels.take (1000)))

        print ("\nMETRICS:")
        # The per-label metrics are best-effort: a failure is reported and the
        # remaining metrics are still printed.
        try:
            print ("false positive (0.0): {0}".format (metrics.falsePositiveRate(0.0)))
            print ("false positive (1.0): {0}".format (metrics.falsePositiveRate(1.0)))
        except Exception:
            traceback.print_exc ()
        try:
            print ("precision          : {0}".format (metrics.precision(1.0)))
        except Exception:
            traceback.print_exc ()
        try:
            print ("recall             : {0}".format (metrics.recall(1.0)))
        except Exception:
            traceback.print_exc ()
        try:
            print ("fMeasure           : {0}".format (metrics.fMeasure(0.0, 2.0)))
        except Exception:
            traceback.print_exc ()

        print ("confusion matrix   : {0}".format (metrics.confusionMatrix().toArray ()))
        print ("precision          : {0}".format (metrics.precision()))
        print ("recall             : {0}".format (metrics.recall()))
        print ("weighted false pos : {0}".format (metrics.weightedFalsePositiveRate))
        print ("weighted precision : {0}".format (metrics.weightedPrecision))
        print ("weighted recall    : {0}".format (metrics.weightedRecall))
        print ("weight f measure   : {0}".format (metrics.weightedFMeasure()))
        print ("weight f measure 2 : {0}".format (metrics.weightedFMeasure(2.0)))
        print ("")

        # Regression metrics
        predictedAndObserved = test.map (lambda p: (model.predict (p.features) / 1.0 , p.label / 1.0 ) )

        regression_metrics = RegressionMetrics (predictedAndObserved)
        print ("explained variance......: {0}".format (regression_metrics.explainedVariance))
        print ("absolute error..........: {0}".format (regression_metrics.meanAbsoluteError))
        print ("mean squared error......: {0}".format (regression_metrics.meanSquaredError))
        print ("root mean squared error.: {0}".format (regression_metrics.rootMeanSquaredError))
        print ("r2......................: {0}".format (regression_metrics.r2))
        print ("")

        # Test error, computed with the model reloaded from disk.
        labelsAndPreds = test.map (lambda p: (p.label, sameModel.predict (p.features)))
        testErr = labelsAndPreds.filter (lambda vp: vp[0] != vp[1]).count () / float (test.count ())
        print ("Testing Error => {0}".format (testErr))
示例#9
0
    def evaluate (conf):
        """
        Annotate Chemotext2 guesses slice by slice and merge the CSV output.

        For each slice in range(conf.slices) whose output directory
        <output_dir>/eval/annotated/<slice> does not exist yet:
          * load the guesses for that slice,
          * load the pmid -> date JSON map from <parent of input_dir>/pmid,
          * annotate the guesses against the facts and pathway facts,
          * write train/, test/ (split by is_training) and csv/ beneath the
            slice directory.
        Finally every part file inside a csv/ directory is concatenated, under
        a header line, into <output_dir>/eval/eval.csv.

        A slice whose pmid date map loads as None is reported and skipped, so
        no output is written for it and nothing from a previous slice is reused.
        """
        logger = LoggingUtil.init_logging (__file__)
        logger.info ("Evaluating Chemotext2 output: {0}".format (conf.input_dir))
        sc = SparkUtil.get_spark_context (conf.spark_conf)
        facts = Facts.get_facts (sc, conf.ctd_conf)
        pathway_facts = Facts.get_pathway_facts (sc, conf.ctd_conf)
        logger.info ("Loaded {0} facts".format (facts.count ()))
        articles = SUtil.get_article_paths (conf.input_dir) #[:200]
        logger.info ("Listed {0} input files".format (len(articles)))
        for slice_n in range (0, conf.slices):
            output_dir = os.path.join (conf.output_dir, "eval", "annotated", str(slice_n))
            if os.path.exists (output_dir):
                logger.info ("Skipping existing directory {0}".format (output_dir))
                continue

            logger.info ("Loading guesses")
            start = time.time ()
            guesses, article_pmids = Guesses.get_guesses (sc,
                                                          conf.input_dir,
                                                          conf.spark_conf.parts,
                                                          articles,
                                                          conf.slices,
                                                          slice_n)
            elapsed = round (time.time () - start, 2)
            count = guesses.count ()
            logger.info ("Guesses[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))

            pmids = sc.broadcast (article_pmids)

            start = time.time ()
            pmid_map_path = os.path.join ( os.path.dirname (conf.input_dir), "pmid", "pmid_date_2.json")
            # /projects/stars/var/chemotext/pmid/pmid_date_2.json

            print ("Loading pmid date map: {0}".format (pmid_map_path))
            with open (pmid_map_path, "r") as stream:
                pmid_date_map = json.load (stream)
            elapsed = round (time.time () - start, 2)
            print ("Read pmid date map in {0} seconds".format (elapsed))

            if pmid_date_map is None:
                # Without the map there is nothing to annotate; skip the slice
                # before its output directory is created so a later run retries it.
                print ("Unable to load pmid date map")
                continue

            start = time.time ()
            pmid_date_map_broadcast = sc.broadcast (pmid_date_map)
            annotated = Guesses.annotate (guesses, facts, pathway_facts, pmids, pmid_date_map_broadcast).cache ()
            count = annotated.count ()
            elapsed = round (time.time () - start, 2)

            logger.info ("Annotation[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))
            logger.info ("Generating annotated output for " + output_dir)
            os.makedirs (output_dir)

            train = annotated. \
                    filter (lambda b : b is not None and is_training (b)). \
                    map (lambda b : json.dumps (b, cls=BinaryEncoder))
            train_out_dir = os.path.join (output_dir, 'train')
            train.saveAsTextFile ("file://" + train_out_dir)
            print ("   --> train: {0}".format (train_out_dir))

            test  = annotated. \
                    filter (lambda b : b is not None and not is_training (b)).\
                    map (lambda b : json.dumps (b, cls=BinaryEncoder))
            test_out_dir = os.path.join (output_dir, 'test')
            test.saveAsTextFile ("file://" + test_out_dir)
            print ("   --> test: {0}".format (test_out_dir))

            # Save CSV
            csv_output = "file://{0}".format (os.path.join (output_dir, "csv"))
            annotated. \
                map (to_csv_row). \
                saveAsTextFile (csv_output)
            print ("   --> csv: {0}".format (csv_output))

        # Concatenate all csvs into one big one
        csv_dirs = os.path.join (conf.output_dir, "eval", "annotated")
        print ("scanning {0}".format (csv_dirs))
        csv_files = []
        for root, dirnames, filenames in os.walk (csv_dirs):
            # Match the directory name exactly: a substring test on the whole
            # path would also pick up train/test part files whenever the
            # output path itself happens to contain "csv".
            if os.path.basename (root) != "csv":
                continue
            for filename in fnmatch.filter(filenames, '*part-*'):
                if "crc" not in filename:
                    csv_files.append (os.path.join(root, filename))
        # os.walk order is arbitrary; sort so repeated runs produce the same file.
        csv_files.sort ()

        eval_dir = os.path.join (conf.output_dir, "eval")
        if not os.path.isdir (eval_dir):
            # Not yet created when no slice has been written (e.g. conf.slices == 0).
            os.makedirs (eval_dir)
        big_csv = os.path.join (eval_dir, "eval.csv")

        with open (big_csv, "w") as stream:
            stream.write ("#pubmed_id,pubmed_date_unix_epoch_time,pubmed_date_human_readable,binary_a_term,binary_b_term,paragraph_distance,sentence_distance,word_distance,flag_if_valid,time_until_verified,freq_sec_deriv\n")
            for f in csv_files:
                with open (f, "r") as in_csv:
                    for line in in_csv:
                        stream.write(line)