import re

# Stop words removed during manual cleaning (a standard long English
# stop-word list, formerly written as a chain of "x != ..." comparisons);
# the same list is applied to both data sets below.
STOP_WORDS = frozenset("""
    a's able about above according accordingly across actually after
    afterwards again against ain't all allow allows almost alone along
    already also although always am among amongst an and another any
    anybody anyhow anyone anything anyway anyways anywhere apart appear
    appreciate appropriate are aren't around as aside ask asking associated
    at available away awfully b be became because become becomes becoming
    been before beforehand behind being believe below beside besides best
    better between beyond both brief but by c c'mon c's came can can't
    cannot cant cause causes certain certainly changes clearly co com come
    comes concerning consequently consider considering contain containing
    contains corresponding could couldn't course currently d definitely
    described despite did didn't different do does doesn't doing don't done
    down downwards during e each edu eg eight either else elsewhere enough
    entirely especially et etc even ever every everybody everyone everything
    everywhere ex exactly example except f far few fifth first five followed
    following follows for former formerly forth four from further
    furthermore g get gets getting given gives go goes going gone got gotten
    greetings h had hadn't happens hardly has hasn't have haven't having he
    he's hello help hence her here here's hereafter hereby herein hereupon
    hers herself hi him himself his hither hopefully how howbeit however i
    i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed
    indicate indicated indicates inner insofar instead into inward is isn't
    it it'd it'll it's its itself j just k keep keeps kept know known knows
    l last lately later latter latterly least less lest let let's like liked
    likely little look looking looks ltd m mainly many may maybe me mean
    meanwhile merely might more moreover most mostly much must my myself n
    name namely nd near nearly necessary need needs neither never
    nevertheless new next nine no nobody non none noone nor normally not
    nothing novel now nowhere o obviously of off often oh ok okay old on
    once one ones only onto or other others otherwise ought our ours
    ourselves out outside over overall own p particular particularly per
    perhaps placed please plus possible presumably probably provides q que
    quite qv r rather rd re really reasonably regarding regardless regards
    relatively respectively right s said same saw say saying says second
    secondly see seeing seem seemed seeming seems seen self selves sensible
    sent serious seriously seven several shall she should shouldn't since
    six so some somebody somehow someone something sometime sometimes
    somewhat somewhere soon sorry specified specify specifying still sub
    such sup sure t t's take taken tell tends th than thank thanks thanx
    that that's thats the their theirs them themselves then thence there
    there's thereafter thereby therefore therein theres thereupon these they
    they'd they'll they're they've think third this thorough thoroughly
    those though three through throughout thru thus to together too took
    toward towards tried tries truly try trying twice two u un under
    unfortunately unless unlikely until unto up upon us use used useful uses
    using usually uucp v value various very via viz vs w want wants was
    wasn't way we we'd we'll we're we've welcome well went were weren't what
    what's whatever when whence whenever where where's whereafter whereas
    whereby wherein whereupon wherever whether which while whither who who's
    whoever whole whom whose why will willing wish with within without won't
    wonder would wouldn't x y yes yet you you'd you'll you're you've your
    yours yourself yourselves z zero
""".split())

# Punctuation characters that were each handled by a separate .replace() pass.
PUNCT_RE = re.compile(r"[,/?:.()!-]")


def clean_tweets(rdd):
    """Strip punctuation, Twitter artifacts, short tokens, and stop words
    from an RDD of [text, label] pairs."""
    rdd = rdd.map(lambda x: [PUNCT_RE.sub(' ', x[0]), x[1]])
    # Drop mentions, links, quotes, entities, and retweet markers.
    rdd = rdd.map(lambda sn: [' '.join(
        w for w in sn[0].split()
        if not w.startswith(('@', 'http', '"', '&', 'rt'))), sn[1]])
    # Drop trailing-quote tokens and anything shorter than three characters.
    rdd = rdd.map(lambda sn: [' '.join(
        w for w in sn[0].split()
        if not w.endswith('"') and len(w) > 2), sn[1]])
    # Drop stop words.
    rdd = rdd.map(lambda sn: [' '.join(
        w for w in sn[0].split() if w not in STOP_WORDS), sn[1]])
    return rdd


def machinelearning():
    # --- Train on the first data set ---
    data1 = sc.textFile("/user/SentimentalData/data1.csv")
    spark = getSparkSessionInstance(data1.context.getConf())
    r1 = data1.mapPartitions(lambda x: csv.reader(x)).map(lambda x: [x[3], x[1]])
    rddr1 = clean_tweets(r1)
    partsDF1 = spark.createDataFrame(
        rddr1.map(lambda x: Row(sentence=str.strip(x[0]), label=int(x[1]))))

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized1 = tokenizer.transform(partsDF1)
    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words1 = remover.transform(tokenized1)
    train_data_raw1 = base_words1.select("base_words", "label")

    word2Vec1 = Word2Vec(vectorSize=3, minCount=0,
                         inputCol="base_words", outputCol="features")
    model1 = word2Vec1.fit(train_data_raw1)
    final_train_data1 = model1.transform(train_data_raw1).select("label", "features")

    lr = LogisticRegression(maxIter=2000, regParam=0.001, elasticNetParam=0.0001)
    lrModel = lr.fit(final_train_data1)
    lrModel.transform(final_train_data1)

    # --- Score the second data set with the trained model ---
    data2 = sc.textFile("/user/ml/data2.csv")
    r2 = data2.mapPartitions(lambda x: csv.reader(x)).map(lambda x: [x[3], x[1]])
    rddr2 = clean_tweets(r2)
    partsDF2 = spark.createDataFrame(
        rddr2.map(lambda x: Row(sentence=str.strip(x[0]), label=int(x[1]))))

    tokenized2 = tokenizer.transform(partsDF2)
    base_words2 = remover.transform(tokenized2)
    train_data_raw2 = base_words2.select("base_words", "label")

    # Note: fitting a second Word2Vec gives this data set an embedding space
    # different from the one the classifier was trained on.
    word2Vec2 = Word2Vec(vectorSize=3, minCount=0,
                         inputCol="base_words", outputCol="features")
    model2 = word2Vec2.fit(train_data_raw2)
    final_train_data2 = model2.transform(train_data_raw2).select("label", "features")
    lrModel.transform(final_train_data2)

    return lrModel
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    VECTOR_SIZE = 50

    df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter("description is not NULL").cache()
    df_jobs.registerTempTable("jobs")
    df_cvs = spark.read.json("allcvs4rdd/allcvs.jsonl").cache()
    df_cvs.registerTempTable("cvs")
    df_categories = spark.read.json("allcategories4rdd/allcategories.jsonl").cache()
    df_categories.registerTempTable("categories")

    joined = spark.sql("SELECT description AS text, jobId AS id, 'job' AS type FROM jobs UNION ALL \
                        SELECT description AS text, cvid AS id, 'cv' AS type FROM cvs UNION ALL \
                        SELECT skillText AS text, id AS id, 'categories' AS type FROM categories")

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenized = tokenizer.transform(joined)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    word2Vec = Word2Vec(vectorSize=VECTOR_SIZE, minCount=0, inputCol="filtered", outputCol="vectors")
    model = word2Vec.fit(removed)
    resultDF = model.transform(removed)

    # p=2 gives L2 (Euclidean) normalization.
    normalizer = Normalizer(inputCol="vectors", outputCol="result", p=2)
    l2NormData = normalizer.transform(resultDF)
    l2NormData.registerTempTable("resultTable")

    jobs = spark.sql("SELECT result AS jobsVec, id AS jobId FROM resultTable WHERE type = 'job'")
    cvs = spark.sql("SELECT result AS cvsVec, id AS cvid FROM resultTable WHERE type = 'cv'")
    categories = spark.sql("SELECT result AS categoriesVec, cat.id, cat.skillName, category FROM resultTable AS rt \
                            LEFT JOIN categories AS cat ON rt.id = cat.id WHERE type = 'categories'")

    # Calculate job-cv similarity
    crossJoined_job_cv = jobs.crossJoin(cvs)
    calculated_job_cv = crossJoined_job_cv.rdd \
        .map(lambda x: (x.jobId, x.cvid, calculate_distance(x.jobsVec, x.cvsVec))) \
        .toDF(["jobid", "cvid", "distance"]).orderBy(asc("jobid")).coalesce(2)
    calculated_job_cv.write.csv('Calculated/word2vec2/job-cv')

    # Calculate cv-category similarity
    crossJoined_cv_cat = cvs.crossJoin(categories)
    calculated_cv_cat = crossJoined_cv_cat.rdd \
        .map(lambda x: (x.cvid, x.id, x.skillName, x.category, calculate_distance(x.cvsVec, x.categoriesVec))) \
        .toDF(["cvid", "category_id", "skillName", "category", "distance"]) \
        .orderBy(asc("cvid"), asc("distance")).coalesce(2)
    calculated_cv_cat.write.csv('Calculated/word2vec2/cv-category')

    # Calculate job-category similarity
    crossJoined_job_cat = jobs.select("jobId", "jobsVec") \
        .crossJoin(categories.select("id", "skillName", "category", "categoriesVec"))
    calculatedDF_job_cat = crossJoined_job_cat.rdd \
        .map(lambda x: (x.jobId, x.id, x.skillName, x.category, calculate_distance(x.jobsVec, x.categoriesVec))) \
        .toDF(["jobid", "catid", "skillName", "category", "distance"])
    ordered_job_cat = calculatedDF_job_cat.orderBy(asc("distance")).coalesce(2)
    ordered_job_cat.write.csv('Calculated/word2vec2/job-category')
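# calculate_distance is not defined in this snippet. A minimal sketch that
# matches how it is called above (two pyspark.ml Vectors in, a float out);
# cosine distance is an assumption — the real helper may use another metric:
import numpy as np

def calculate_distance(vec_a, vec_b):
    """Cosine distance between two pyspark.ml Vectors."""
    a, b = vec_a.toArray(), vec_b.toArray()
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    if denom == 0.0:
        return 1.0  # treat a zero vector as maximally distant
    return float(1.0 - np.dot(a, b) / denom)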
def trainItem2vec(dataset, filename, saveToRedis=False, redisKeyPrefix=None):
    '''
    Train embeddings; inputCol must be of type array(string).
    After training, the vectors are written to `filename`.
    :param dataset:
    :return:
    '''
    word2vec = Word2Vec(vectorSize=10, windowSize=5, maxIter=10, inputCol='movieIds')
    model = word2vec.fit(dataset)
    print('model fitted')

    # Print similar movies, based on dot products.
    synonyms = model.findSynonymsArray('158', 20)
    for movieid, similarity in synonyms:
        print('{}:{}'.format(movieid, similarity))

    with open('./modeldata/{}'.format(filename), 'w') as f:
        for row in model.getVectors().collect():
            tmp = ','.join([str(vector) for vector in row['vector']])
            f.write('{}:{}\n'.format(row['word'], tmp))

    # redis-cli eval "redis.call('del', unpack(redis.call('keys','*')))" 0
    # deletes keys in bulk on Windows.
    if saveToRedis:
        pool = redis.ConnectionPool(host=HOST, port=PORT)
        ex = 60 * 10  # key time-to-live, in seconds
        r = redis.Redis(connection_pool=pool)
        for i, row in enumerate(model.getVectors().collect()):
            tmp = ','.join([str(vector) for vector in row['vector']])
            if i == 1:
                print(type(row['vector']))
            r.set('{}:{}'.format(redisKeyPrefix, row['word']), tmp, ex)
    return model
def analyzeSent(host, port, db_name, ALL_WEB_DATA):
    '''
    Cluster the split sentences.
    :return:
    '''
    sen = senSplit(host, port, db_name, ALL_WEB_DATA)
    df = spark.createDataFrame(sen)

    def tokenizer(row):
        '''
        Map a document row to a token list.
        :param row:
        :return:
        '''
        result = list()
        row_sp = ''.join(row.sent.split())
        result.append(row_sp)
        return [result]

    rdd = df.selectExpr('_1 as sent').rdd.map(tokenizer)
    df = spark.createDataFrame(rdd, ['sent'])

    wv_df = Word2Vec(vectorSize=5, minCount=0, inputCol="sent", outputCol="features")
    model_wv = wv_df.fit(df)
    wv_df = model_wv.transform(df)
    # model_wv.getVectors().show(truncate=False)

    km = KMeans(featuresCol="features", k=5)
    model_km = km.fit(wv_df)
    df_km = model_km.transform(wv_df)
    df_km.select('sent', 'prediction').show(truncate=False)
def transform(self):
    word2Vec = Word2Vec(vectorSize=self._vector_size, inputCol="content", outputCol="features")
    model = word2Vec.fit(self._mapped_data)
    w2v_data = model.transform(self._mapped_data)
    return w2v_data.drop('content')
def Main():
    spark = SparkSession.builder \
        .appName("Word2Vec") \
        .config("spark.driver.cores", "8") \
        .config("spark.driver.maxResultSize", "13312m") \
        .config("spark.driver.memory", "26624m") \
        .config("spark.executor.cores", "8") \
        .config("spark.executor.memory", "37237m") \
        .getOrCreate()

    FILE_NO = 7
    total_df = []
    for idx in range(FILE_NO):
        each_df = spark.read.format("json") \
            .option("mode", "FAILFAST") \
            .option("inferSchema", "true") \
            .load("gs://dataproc-7e10897a-5391-4ea0-b815-f6e72cf284f7-asia-east1/data/contents/data.{}".format(idx))
        total_df.append(each_df)
    df = reduce(DataFrame.unionAll, total_df)
    df = df.select(df.id, explode(df.morphs).alias("words"))

    word2vec = Word2Vec(vectorSize=300, minCount=0, windowSize=2, numPartitions=10,
                        inputCol="words", outputCol="vector")
    model = word2vec.fit(df)
    df = model.transform(df)

    final = df.groupBy("id") \
        .agg(collect_list(struct("vector")).alias("matrix"))
    final.show(1)
def Doc2vec(args, train_data, vector_size=10, window_size=5, input_col='reference_list'):
    word2Vec = Word2Vec(vectorSize=vector_size, minCount=2, seed=42, maxIter=1,
                        windowSize=window_size, inputCol=input_col, outputCol="ref_vec")
    model = word2Vec.fit(train_data)
    model_name = "{}_{}_{}_{}".format(args.model_name, vector_size, window_size, args.is_item)
    model.write().overwrite().save(os.path.join(model_pth, model_name))

    vec = model.getVectors()
    train_data = model.transform(train_data)
    train_data = train_data.join(vec, train_data.impression == vec.word, how='left')
    train_data = train_data.withColumn('score', getCosinDis("ref_vec", 'vector'))
    train_data = train_data.withColumn("row_number", F.rank().over(
        Window.partitionBy(GR_COLS).orderBy(train_data["score"].desc())))

    data = train_data.select(GR_COLS + ['action_type', 'impressions', 'reference',
                                        'score', 'row_number', 'impression']) \
        .filter("reference == impression")
    demo_cnt = data.cube("row_number").count().toPandas()
    demo_cnt = demo_cnt.dropna()
    # Mean reciprocal rank over the rank histogram.
    MRR = ((demo_cnt['count'] / demo_cnt['count'].sum()) * (1. / (26.0 - demo_cnt['row_number']))).sum()
    print('**---**' * 20)
    print("InputCol:{} vector_size:{} window_size:{} MRR:{}".format(input_col, vector_size, window_size, MRR))
    return MRR
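# getCosinDis is not defined in this snippet. A plausible sketch, assuming it
# is a UDF returning cosine similarity between two vector columns (the name
# and signature are taken from the call site above):
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

@udf(returnType=DoubleType())
def getCosinDis(ref_vec, vector):
    # Either column may be null after the left join above.
    if ref_vec is None or vector is None:
        return None
    a, b = np.array(ref_vec.toArray()), np.array(vector.toArray())
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    return float(np.dot(a, b) / denom) if denom else None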
def word2vec(self):
    from pyspark.ml.feature import Word2Vec
    documentDF = self.session.createDataFrame(
        [("Hi I heard about Spark".split(" "), ),
         ("I wish Java could use case classes".split(" "), ),
         ("Logistic regression models are neat".split(" "), )],
        ["text"])

    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
    model = word2Vec.fit(documentDF)

    # transform simply averages the word vectors of each document.
    result = model.transform(documentDF)
    for row in result.collect():
        text, vector = row
        print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

    # To pull all the word vectors out for later use:
    res = dict([(item["word"], item["vector"].toArray()) for item in model.getVectors().collect()])
    print(res["heard"])
def compute_word2vec(self, input_df, output_vec_len, window_size=5, sub_test=False):
    """
    Compute the word2vec for a given dataframe

    @param input_df : the dataframe to perform the action upon
    @param output_vec_len : the length (int) of the output vector
    @param window_size : the word2vec window size (int)
    @param sub_test : if True, return the raw transformation result
    @return output dataframe with output column (self.input_col and
            self.output_col name the input and output columns)
    """
    # ensure that the input column is of type ArrayType(StringType())
    toArray = udf(lambda vs: vs, ArrayType(StringType()))
    df = input_df.withColumn(self.input_col, toArray(input_df[self.input_col]))

    # initialize word2vec
    word2Vec = Word2Vec(vectorSize=output_vec_len, windowSize=window_size, minCount=5,
                        inputCol=self.input_col, outputCol=self.output_col)
    # train word2vec model
    model = word2Vec.fit(df)
    # compute transformation
    result = model.transform(df)
    if not sub_test:
        # convert result to a dense vector
        conv = udf(lambda vs: Vectors.dense(vs), VectorUDT())
        out = result.withColumn(self.output_col, conv(result[self.output_col]))
        return out
    else:
        return result
def frequency_vector_DataFrame(trainDF, cluster_count):
    regTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="[^a-z]")
    dfTokenizer = regTokenizer.transform(trainDF)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    df_remover = remover.transform(dfTokenizer)

    # feature extraction using Word2Vec
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="word2vec")
    vectors = word2Vec.fit(df_remover).getVectors()
    vectors_DF = vectors.select(vectors.word, vectors.vector.alias("features"))

    # cluster the word vectors with k-means
    kmeans = KMeans().setK(cluster_count).setSeed(1)
    km_model = kmeans.fit(vectors_DF)

    # broadcast the word -> cluster mapping
    vocabDF = km_model.transform(vectors_DF).select("word", "prediction")
    vocabDict = dict(vocabDF.rdd.collect())
    vocab_dict = sc.broadcast(vocabDict)

    # build per-review cluster-frequency vectors (as RDDs)
    reviewsDF = df_remover.select(df_remover.filtered, df_remover.label).rdd
    clusterVectorRdd = reviewsDF.map(partial(word_to_cluster, vocab_dict=vocab_dict))
    cluster_frequency_feature_Rdd = clusterVectorRdd.map(
        partial(cluster_frequency_vector, cluster_count=cluster_count))
    # tuple-parameter unpacking in lambdas is Python 2 only; index instead
    cluster_freqDF = cluster_frequency_feature_Rdd.map(lambda xy: Row(xy[0], xy[1])).toDF()
    cluster_freq_featureDF = cluster_freqDF.select(cluster_freqDF._1.alias("features"),
                                                   cluster_freqDF._2.alias("label"))
    return cluster_freq_featureDF
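# word_to_cluster and cluster_frequency_vector are not defined in this
# snippet. Hypothetical sketches of both, matching only how they are called
# above; the real implementations may differ:
from pyspark.ml.linalg import Vectors

def word_to_cluster(row, vocab_dict=None):
    """Map each token of a (filtered, label) row to its k-means cluster id."""
    words, label = row
    clusters = [vocab_dict.value[w] for w in words if w in vocab_dict.value]
    return clusters, label

def cluster_frequency_vector(row, cluster_count=None):
    """Turn a list of cluster ids into a normalized frequency vector."""
    clusters, label = row
    counts = [0.0] * cluster_count
    for c in clusters:
        counts[c] += 1.0
    total = sum(counts) or 1.0
    return Vectors.dense([c / total for c in counts]), label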
def trainData():
    # ------------------------- TRAINING DATA SET -------------------------
    rddTrain = sc.textFile("/ccga/set100k.csv")
    r = rddTrain.mapPartitions(lambda x: csv.reader(x))
    parts = r.map(lambda x: Row(sentence=str.strip(x[3]), label=int(x[1])))
    spark = getSparkSessionInstance(rddTrain.context.getConf())
    partsDF = spark.createDataFrame(parts)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized = tokenizer.transform(partsDF)
    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words = remover.transform(tokenized)
    train_data_raw = base_words.select("base_words", "label")

    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")
    model = word2Vec.fit(train_data_raw)
    final_train_data = model.transform(train_data_raw).select("label", "features")

    lr = LogisticRegression(maxIter=1000, regParam=0.001, elasticNetParam=0.0001)
    lrModel = lr.fit(final_train_data)
    lrModel.transform(final_train_data)
    return lrModel
def train_word2vec(data):
    word2Vec = Word2Vec(vectorSize=embedded_size, minCount=0, inputCol="comment", outputCol="result")
    model = word2Vec.fit(data)
    # model.save(sc, "train_results/word2vec.train")
    return model
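# Note: the commented-out save above uses the pyspark.mllib signature
# (sc, path); a pyspark.ml Word2VecModel writer takes only a path. A minimal
# sketch as a hypothetical helper, reusing the path from the comment:
def save_word2vec(model, path="train_results/word2vec.train"):
    model.write().overwrite().save(path)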
def learn_with(dataset=None, save=True):
    [data, target_multi, target_single] = dataset

    # Build word vectors with word2vec; vector length 100.
    documentDF = spark.createDataFrame(
        [(data[i].split(" "), emoji_id_mapper[target_single[i]]) for i in range(len(data))],
        ["text", "label"])
    word2Vec = Word2Vec(vectorSize=100, minCount=0, inputCol="text", outputCol="features")
    model = word2Vec.fit(documentDF)
    result = model.transform(documentDF)
    result.select("label", "features").show()

    # train & test data
    (trainingData, testData) = result.randomSplit([0.8, 0.2], seed=100)
    print("Training Dataset Count: " + str(trainingData.count()))
    print("Test Dataset Count: " + str(testData.count()))

    # Fit an LR model.
    lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
    lrModel = lr.fit(trainingData)
    predictions = lrModel.transform(testData)
    predictions.select("features", "label", "prediction") \
        .show(n=30, truncate=30)
    if save:
        lrModel.save('trained_models/')
def main():
    data_schema = types.StructType([
        types.StructField('title', types.StringType()),
        types.StructField('text', types.StringType()),
        types.StructField('label', types.IntegerType())
    ])
    datadf = spark.read.csv("s3://projfakenews/ProcessedDatawithoutStemming", schema=data_schema)
    datadf.show()

    word2Vec = Word2Vec(vectorSize=100, minCount=0, inputCol="text", outputCol="features")
    # Word2Vec expects an array column, so wrap the text string in an array.
    datadf = datadf.withColumn("text", functions.array("text"))
    model = word2Vec.fit(datadf)
    result = model.transform(datadf)
    result.show()
    result.select("features").show(1)

    result = result.dropna()
    result = result.randomSplit([0.8, 0.2], 24)
    print(result[0].count(), result[1].count())
    make_model(result[0], result[1])
def main(*args):
    if len(args) != 2:
        print("Please provide both input and output directories!")
        sys.exit(1)
    input_fn, output_fn = args[0], args[1]

    conf = SparkConf()
    conf.setAppName("Word2Vec")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # create the post table containing the tags info as a string
    posts = sc.textFile(input_fn)
    df_post = (posts.map(lambda line: line.strip())
               .filter(lambda line: line.startswith('<row'))
               .filter(lambda line: '/>' in line)
               .map(Post.parse)
               .map(lambda x: (x.owneruserid, x.tags))
               .toDF(['ownerid', 'tags']))

    # parse the tags using the generic functions into a list of words
    df_tags = (df_post
               .withColumn('input', F.regexp_replace(F.col('tags'), '<', ''))
               .withColumn('input', F.lower(F.col('input')))
               .withColumn('input', F.split(F.col('input'), '>')))

    # build the machine learning pipeline
    w2v = Word2Vec(inputCol="input", outputCol="vectors", vectorSize=100, minCount=10, seed=42)
    model = w2v.fit(df_tags)
    result = model.transform(df_tags)
    (model.findSynonyms("ggplot2", 25)
     .rdd.map(lambda x: (x[0], x[1]))
     .saveAsTextFile(output_fn))
def get_pipeline(vector_size=50, class_num=5, stopwords=None):
    '''
    Build the pipeline.
    This demo pipeline has the following stages:
    1. labelIndexer  index the labels, converting strings to integers
    2. tokenizer     split sentences into words
    3. remover       remove stop words
    4. word2vec      map text to low-dimensional vectors with word2vec
    5. mpc           multilayer perceptron classifier
    '''
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    tokenizer = Tokenizer(inputCol="text", outputCol="raw_words")
    remover = StopWordsRemover(inputCol="raw_words", outputCol="words", stopWords=stopwords)
    word2vec = Word2Vec(vectorSize=vector_size, minCount=2, inputCol="words", outputCol="vector")
    # Layer sizes must be integers, so use floor division for the hidden layer.
    layers = [vector_size, (vector_size + class_num) // 2, class_num]
    mpc = MultilayerPerceptronClassifier(maxIter=100, layers=layers, seed=1234,
                                         featuresCol="vector", labelCol="indexLabel")
    pipeline = Pipeline(
        stages=[labelIndexer, tokenizer, remover, word2vec, mpc])
    return pipeline
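# A minimal usage sketch for the pipeline above, assuming an active
# SparkSession named spark and string columns "text" and "label" (the sample
# rows are illustrative):
train_df = spark.createDataFrame(
    [("this movie was great", "pos"), ("this movie was awful", "neg")],
    ["text", "label"])
demo_pipeline = get_pipeline(vector_size=50, class_num=2)
demo_model = demo_pipeline.fit(train_df)
demo_model.transform(train_df).select("text", "prediction").show()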
def textstages(inputCol='stemmed'):
    # TF-IDF is a bag-of-words method that gives higher weight to words that
    # appear frequently in a document but not frequently across all of the
    # documents. The output is a features column.
    tf = HashingTF(inputCol=inputCol, outputCol='rawFeatures', numFeatures=500)
    idf = IDF(inputCol='rawFeatures', outputCol='features', minDocFreq=2.0)

    # Word2Vec is a word-embedding method that represents each word as a
    # vector, with words of similar meaning having neighboring vectors.
    # The output is a features column.
    word2vec = Word2Vec(vectorSize=300, inputCol=inputCol, outputCol='features')

    # DocumentAssembler to get Annotators (the data type used by spark-nlp)
    docas = DocumentAssembler().setInputCol('joinedLem').setOutputCol('document')
    tok = Tokenizer().setInputCols(['document']).setOutputCol('token')
    # add the BERT stage
    bert = BertEmbeddings.pretrained('bert_base_uncased', 'en') \
        .setInputCols(['document', 'token']).setOutputCol('bertFeatures')
    embfin = sparknlp.EmbeddingsFinisher() \
        .setInputCols('bertFeatures') \
        .setOutputCols('finfeatures') \
        .setOutputAsVector(True)
    # EmbeddingsFinisherFinisher appears to be a project-local wrapper stage.
    embfinfin = EmbeddingsFinisherFinisher(inputCol='finfeatures', outputCol='features')
    return [[tf, idf], [word2vec], [docas, tok, bert, embfin, embfinfin]]
def build_pipeline(classifier='rf', max_depth=7):
    """
    creates a pipeline of functionalities to be applied on the training set
    """
    # Training: tokenize, remove stop words, compute n-grams, compute frequencies
    tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern='\w{8}|\s')
    remover = StopWordsRemover(inputCol='words', outputCol='filtered', stopWords=['??'])
    ngram_2 = NGram(n=2, inputCol='filtered', outputCol='ngrams')
    ngram_3 = NGram(n=3, inputCol='filtered', outputCol='ngrams')
    hashingTF = HashingTF(inputCol="ngrams", outputCol="features")
    word2vec = Word2Vec(inputCol='ngrams', outputCol='features')

    if classifier == 'rf':
        clf = RandomForestClassifier(maxDepth=max_depth)
        stages = [tokenizer, remover, ngram_2, hashingTF, clf]
    elif classifier == 'nb':
        clf = NaiveBayes(smoothing=1)
        stages = [tokenizer, remover, ngram_3, hashingTF, clf]
    elif classifier == 'lr':
        clf = LogisticRegression()
        stages = [tokenizer, remover, ngram_2, word2vec, clf]
    else:
        raise ValueError("classifier must be 'rf', 'nb', or 'lr'.")
    return stages
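# A usage sketch: build_pipeline returns a list of stages, so wrap it in a
# Pipeline before fitting. A DataFrame train_df with a "text" column and a
# numeric "label" column is assumed here:
from pyspark.ml import Pipeline

lr_pipeline = Pipeline(stages=build_pipeline(classifier='lr'))
# lr_model = lr_pipeline.fit(train_df)
# predictions = lr_model.transform(test_df)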
def main(sc, spark):
    # Load the corpus
    corpus = load_corpus(sc, spark)

    # Create the vector/cluster pipeline
    pipeline = Pipeline(stages=[
        Tokenizer(inputCol="text", outputCol="tokens"),
        Word2Vec(vectorSize=7, minCount=0, inputCol="tokens", outputCol="vecs"),
        BisectingKMeans(k=10, featuresCol="vecs", maxIter=10),
    ])

    # Fit the model
    model = pipeline.fit(corpus)
    corpus = model.transform(corpus)

    # Evaluate clustering.
    bkm = model.stages[-1]
    cost = bkm.computeCost(corpus)
    sizes = bkm.summary.clusterSizes
    # TODO: compute cost of each cluster individually

    # Get the text representation of each cluster.
    wvec = model.stages[-2]
    table = [["Cluster", "Size", "Terms"]]
    for ci, c in enumerate(bkm.clusterCenters()):
        ct = wvec.findSynonyms(c, 7)
        size = sizes[ci]
        terms = " ".join([row.word for row in ct.take(7)])
        table.append([ci, size, terms])

    # Print results
    print(tabulate(table))
    print("Sum of square distance to center: {:0.3f}".format(cost))
def main(train_data, test_data, sc, sqlContext, output):
    text = sqlContext.read.json(train_data)
    train_df = text.select(text.reviewText, text.overall.alias("label"))

    # RegexTokenizer to split the words
    regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W|[0-9]")
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    # Word2Vec for representing text as vectors
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="features")
    lr = LinearRegression(maxIter=20, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[regexTokenizer, remover, word2Vec, lr])

    paramGrid = (ParamGridBuilder()
                 .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
                 .build())
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(),
                              numFolds=5)  # 5-fold cross validation
    cv_model = crossval.fit(train_df)

    # Training data evaluation
    train_prediction = cv_model.transform(train_df)
    train_prediction.show()
    train_evaluator = RegressionEvaluator(metricName="rmse", labelCol="label",
                                          predictionCol="prediction")
    train_rmse = train_evaluator.evaluate(train_prediction)

    text_test = sqlContext.read.json(test_data)
    test_df = text_test.select(text_test.reviewText, text_test.overall.alias("label"))

    # Testing data evaluation
    test_prediction = cv_model.transform(test_df)
    test_prediction.show()
    test_evaluator = RegressionEvaluator(metricName="rmse", labelCol="label",
                                         predictionCol="prediction")
    test_rmse = test_evaluator.evaluate(test_prediction)

    print("Training Root mean square error = " + str(train_rmse))
    print("Testing Root mean square error = " + str(test_rmse))

    # output written to file
    out_file = open(output, 'w')
    out_file.write(str(train_rmse))
    out_file.write(str(test_rmse))
    out_file.close()
def sentimentAnalyze():
    # ------------------------- FIRST DATA SET (train) -------------------------
    rdd = sc.textFile("/ccga/SentimentTrain60k.csv")
    r = rdd.mapPartitions(lambda x: csv.reader(x))
    parts = r.map(lambda x: Row(sentence=str.strip(x[3]), label=int(x[1])))
    spark = getSparkSessionInstance(rdd.context.getConf())
    partsDF = spark.createDataFrame(parts)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized = tokenizer.transform(partsDF)
    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words = remover.transform(tokenized)
    train_data_raw = base_words.select("base_words", "label")

    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")
    model = word2Vec.fit(train_data_raw)
    final_train_data = model.transform(train_data_raw).select("label", "features")

    lr = LogisticRegression(maxIter=1000, regParam=0.001, elasticNetParam=0.0001)
    lrModel = lr.fit(final_train_data)
    trained = lrModel.transform(final_train_data)

    # ------------------------- SECOND DATA SET (test) -------------------------
    rdd2 = sc.textFile("/ccga/SentimentTest40k.csv")
    r2 = rdd2.mapPartitions(lambda x: csv.reader(x))
    parts2 = r2.map(lambda x: Row(sentence=str.strip(x[3]), label=int(x[1]), tweet=str(x[3])))
    spark2 = getSparkSessionInstance(rdd2.context.getConf())
    partsDF2 = spark2.createDataFrame(parts2)

    tokenizer2 = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized2 = tokenizer2.transform(partsDF2)
    remover2 = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words = remover2.transform(tokenized2)
    train_data_raw2 = base_words.select("base_words", "label", "tweet")
    # Reuse the Word2Vec model fitted on the training set.
    final_train_data2 = model.transform(train_data_raw2).select("label", "features", "tweet")

    predict = lrModel.transform(final_train_data2)
    trained.show()
    predict.show()
    print("------------------------- Working perfectly -------------------------")
def p1(time, rdd):
    rdd = rdd.map(lambda x: json.loads(x[1])) \
             .map(lambda x: x['text']) \
             .map(lambda x: x.upper())

    # Tag tweets by the keyword they contain.
    rdd_MAGA = rdd.filter(lambda x: "MAGA" in x).map(lambda x: [x, "MAGA"])
    rdd_DICTATOR = rdd.filter(lambda x: "DICTATOR" in x).map(lambda x: [x, "DICTATOR"])
    rdd_IMPEACH = rdd.filter(lambda x: "IMPEACH" in x).map(lambda x: [x, "IMPEACH"])
    rdd_DRAIN = rdd.filter(lambda x: "DRAIN" in x).map(lambda x: [x, "DRAIN"])
    rdd_SWAMP = rdd.filter(lambda x: "SWAMP" in x).map(lambda x: [x, "SWAMP"])
    rdd_COMEY = rdd.filter(lambda x: "COMEY" in x).map(lambda x: [x, "COMEY"])
    rdd1 = sc.union([rdd_MAGA, rdd_DICTATOR, rdd_IMPEACH, rdd_DRAIN, rdd_SWAMP, rdd_COMEY])

    parts = rdd1.map(lambda x: Row(sentence=x[0], label=x[1], date=time))
    spark = getSparkSessionInstance(rdd.context.getConf())
    partsDF = spark.createDataFrame(parts)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized = tokenizer.transform(partsDF)
    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words = remover.transform(tokenized)
    train_data_raw = base_words.select("base_words", "label", "date")

    # Vectorize
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")
    model = word2Vec.fit(train_data_raw)
    final_train_data3 = model.transform(train_data_raw).select("label", "features", "date")

    # lrModel is expected to be a logistic-regression model trained elsewhere.
    final_model = lrModel.transform(final_train_data3)
    final_model.show()

    sentimentDataFrame = final_model.select("label", "date", "prediction")
    sentimentDataFrame.createOrReplaceTempView("sentimental")
    sentimentDataFrame = spark.sql(
        "select label, date, prediction, count(*) as total_label "
        "from sentimental group by label, date, prediction order by label")
    sentimentDataFrame.show()
    sentimentDataFrame.write.mode("append").saveAsTable("sentiment1")
def main(self):
    stop_words = []
    # prod
    dataframe = self.read_dataframe(self.path, self.days_list).persist()
    # read approved user list
    # df = self.spark.read.csv(
    #     "hdfs:///ssymmetry_db/raw_db/sina_user_tag/sina_user_tag_item/weibo_uid_with_user_tag.csv")\
    #     .select("uid", "user_tag")
    # local test
    # dataframe = self.spark.read.json("sina_weibo_fans_data_2017-11-09-10-18.json")
    blog_rdd = self.read_blog_data(dataframe).fillna(" ").rdd

    def preprocess_data(x):
        uid = x["uid"]
        blog_content = x["blog_content"]
        forward_content = x["forward_content"]
        if forward_content.rfind(u"*****") > 0:
            forward_content = forward_content.split(u"*****")[1]
        return (uid, blog_content + forward_content)

    data = blog_rdd.map(preprocess_data) \
        .reduceByKey(lambda x, y: x + y) \
        .map(lambda x: [" ".join(jieba.cut(x[1])).split(" ")])

    sql_context = SQLContext(sparkContext=self.spark.sparkContext)
    word_df = sql_context.createDataFrame(data, ["values"])
    w2vec = Word2Vec(vectorSize=128, inputCol="values")
    model = w2vec.fit(word_df)

    def creat_dictionary(model):
        w_df = model.getVectors()
        w_df.show()
        data = w_df.rdd.collect()
        w2index = {}
        w2vec = {}
        i = 1
        for row in data:
            word = row.word
            vector = row.vector
            w2index[word] = i
            w2vec[word] = vector
            i += 1
        return w2index, w2vec

    # Dump the word2vec vectors to a pickle file.
    index_dict, word_vectors = creat_dictionary(model)
    # out = open("w2vec.pkl", "wb")
    out = open("/udisk2/hxk/w2vec/w2vec.pkl", "wb")
    pickle.dump(index_dict, out)    # index dictionary
    pickle.dump(word_vectors, out)  # word-vector dictionary
    out.close()
    # test
    model.findSynonyms("你", 3).show()
def get_word_vec(self):
    data = self.merge_df.groupBy('user_id').agg(
        func.sort_array(func.collect_list(func.struct(func.col('time'), func.col('ad_id'))),
                        asc=True).alias('items'))
    # Keep only the ad_id from each (time, ad_id) struct, in time order.
    data = data.withColumn(
        "items", func.udf(lambda x: [i[1] for i in x], ArrayType(StringType()))('items'))
    word2Vec = Word2Vec(vectorSize=128, minCount=10, inputCol="items", outputCol="result")
    model = word2Vec.fit(data.repartition(1000))
    return model
def train(self):
    self.__prepare()
    spark = SparkSession \
        .builder \
        .appName("Kursach") \
        .getOrCreate()

    input_file = spark.sparkContext.textFile('./w2v.txt')
    prepared = input_file.map(lambda x: ([x]))
    df = prepared.toDF()
    prepared_df = df.selectExpr('_1 as text')

    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)
    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol='words', outputCol='filtered', stopWords=stop_words)
    filtered = remover.transform(words)

    vectorizer = CountVectorizer(inputCol='filtered', outputCol='raw_features').fit(filtered)
    featurized_data = vectorizer.transform(filtered)
    featurized_data.cache()
    vocabulary = vectorizer.vocabulary

    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(featurized_data)
    rescaled_data = idf_model.transform(featurized_data)

    self.__word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol='words', outputCol='result')
    self.__model = self.__word2Vec.fit(filtered)
    w2v_df = self.__model.transform(words)
    w2v_df.show()
    spark.stop()
def word2vec(words):
    word2Vec = Word2Vec(vectorSize=300, minCount=2, seed=42, inputCol="words", outputCol="features")
    model = word2Vec.fit(words)
    w2v = model.transform(words)
    return w2v
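# A minimal usage sketch for word2vec(words) above; the sample DataFrame is
# illustrative and an active SparkSession named spark is assumed:
words_df = spark.createDataFrame(
    [("spark is fast".split(" "),), ("spark is fun and fast".split(" "),)],
    ["words"])
features_df = word2vec(words_df)
features_df.select("features").show(truncate=False)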
def method(traindata, testdata, model):
    regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
    add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the"]
    stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
    label_stringIdx = StringIndexer(inputCol="airline_sentiment", outputCol="label")

    if model == "Count":
        x = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
    elif model == "Word2Vec":
        x = Word2Vec(vectorSize=1000, minCount=5, inputCol="filtered", outputCol="features")
    else:
        hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
        x = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)  # minDocFreq: remove sparse terms

    lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
    if model == "TFIDF":
        pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, label_stringIdx, hashingTF, x, lr])
    else:
        pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, label_stringIdx, x, lr])

    pipelineFit = pipeline.fit(traindata)
    predictions = pipelineFit.transform(testdata)
    predictions.filter(predictions['prediction'] == 0) \
        .select("text", "airline_sentiment", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False).show(n=10, truncate=30)
    predictions.filter(predictions['prediction'] == 1) \
        .select("text", "airline_sentiment", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False).show(n=10, truncate=30)

    # BinaryClassificationEvaluator reports area under ROC by default,
    # not F1, so label the metric accordingly.
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label")
    print("AUC: %g" % (evaluator.evaluate(predictions)))
    c = "logreg" + model + ".model"
    pipelineFit.save(c)
def make_places_model():
    spark = SparkSession \
        .builder \
        .appName("SimpleApplication") \
        .getOrCreate()

    data = read_data_for_mode('forPlacesModel')
    persons = []
    text = []
    for person in data:
        NAME = person[0].split(' ')
        persons.append('_'.join(NAME))
        text += person[1]

    def remove_punctuation(text):
        return re.sub(r'[^\w\s^-]', '', text)

    for i in range(len(text)):
        text[i] = remove_punctuation(text[i])

    def remove_stop_words(text: list):
        ru_stop = stopwords.words('russian')
        tokens = []
        for line in text:
            line_tokenized = line.split()
            for token in line_tokenized:
                if not token.lower() in ru_stop:
                    tokens.append(token)
        return tokens

    tokens = remove_stop_words(text)

    def get_only_words(tokens):
        # keep Cyrillic tokens only
        return list(filter(lambda x: re.match('[а-яА-Я]+', x), tokens))

    tokens = get_only_words(tokens)
    full_text = ' '.join(tokens)

    documentDF = spark.createDataFrame([(full_text.split(" "), )], ["text"])
    model = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
    model_fitted = model.fit(documentDF)
    model_transformed = model_fitted.transform(documentDF)

    model.save('/home/pok/sem/project/models/places/model0mincount/model')
    model_fitted.save('/home/pok/sem/project/models/places/model0mincount/fitted')
    # model_transformed.save('/home/pok/sem/project/models/model0mincount/transformed')
    spark.stop()
def overlappingNgramWord2VecEncode(self, n=None, windowSize=None, vectorSize=None, fileName=None, sc=None):
    '''
    Encodes a protein sequence by converting it into n-grams and then
    transforming it into a Word2Vec feature vector.

    If given a word2Vec file name, this function instead encodes a protein
    sequence by converting it into n-grams and transforming it with the
    pre-trained word2Vec model read from that file.

    Attributes:
        n (int): the number of words in an n-gram
        windowSize (int): width of the window used to slide across the
            sequence; context words come from [-window, window]
        vectorSize (int): dimension of the feature vector
        fileName (str): filename of the Word2Vec model

    Returns:
        dataset with a features vector added to the original dataset
    '''
    # Create n-grams out of the sequence,
    # e.g. 2-grams of IDCGH... => [ID, DC, CG, GH, ...]
    data = sequenceNgrammer.ngram(self.data, n, "ngram")

    if not (n is None and windowSize is None and vectorSize is None):
        # Convert n-grams to Word2Vec feature vectors
        # [ID, DC, CG, GH, ...] => [0.1234, 0.2394, ...]
        word2Vec = Word2Vec()
        word2Vec.setInputCol("ngram") \
            .setOutputCol(self.outputCol) \
            .setNumPartitions(8) \
            .setWindowSize(windowSize) \
            .setVectorSize(vectorSize)
        self.model = word2Vec.fit(data)
    elif fileName is not None and sc is not None:
        reader = Word2VecModel()
        self.model = reader.load(sc, fileName)
        print(f"model file : {fileName} \n"
              f"inputCol : {self.model.getInputCol()} \n"
              f"windowSize : {self.model.getWindowSize()} \n"
              f"vectorSize : {self.model.getVectorSize()}")
        self.model.setOutputCol(self.outputCol)
    else:
        raise Exception("Either provide a word2Vec file (fileName) + SparkContext (sc), "
                        "or the number of words (n) + window size (windowSize) "
                        "+ vector size (vectorSize) as function parameters")

    return self.model.transform(data)
def main():
    conf = SparkConf().setAppName('Sentiment Analysis_Word2Vec')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    train_inputs = sys.argv[1]
    test_inputs = sys.argv[2]

    schema = StructType([
        StructField('reviewText', StringType(), False),
        StructField('overall', DoubleType(), False),
    ])
    read_json = sqlContext.read.json(train_inputs, schema)
    read_json.registerTempTable('read_json')
    lowercase = sqlContext.sql("""
        SELECT lower(reviewText) as reviewText, overall as label FROM read_json
    """)

    regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W")
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    word2vec = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="features")
    lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[regexTokenizer, remover, word2vec, lr])

    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(),
                              numFolds=5)
    cvModel = crossval.fit(lowercase)

    testDF = sqlContext.read.json(test_inputs, schema)
    testDF.registerTempTable('test_data')
    test_data = sqlContext.sql("""
        SELECT lower(reviewText) as reviewText, overall as label FROM test_data
    """)

    train_prediction = cvModel.transform(lowercase)
    test_prediction = cvModel.transform(test_data)
    evaluator = RegressionEvaluator()
    print("Training dataset RMSE error: %s" % str(evaluator.evaluate(train_prediction)))
    print("Testing dataset RMSE: %s" % str(evaluator.evaluate(test_prediction)))