Example #1
def predict():
    preproses()
    td = TFIDF([xdata, ydata])
    clasification = []

    # Receives the input query from form
    if request.method == 'POST':
        namequery = request.form['namequery']
        spliter = namequery.split(',')

        for row in spliter:
            clasification.append(testFromTrained([td.transform(row)]))
        print(clasification)
        keras.clear_session()

        labels, values = np.unique(clasification, return_counts=True)
        lbls, vals = np.unique(clasification, return_counts=True)

    pie_labels = labels
    pie_values = values
    colors = ["#F7464A", "#46BFBD"]

    return render_template('hasil.html',
                           set=zip(values, labels, colors),
                           clasification=zip(spliter, clasification),
                           legenda=zip(lbls, vals))
Example #2
    def preprocess(self, filepath):
        dataset = pd.read_csv(filepath, delimiter=',')

        self.xData = []
        self.yData = []

        for k in dataset['Kalimat']:
            self.xData.append(k)

        for k in dataset['Formalitas']:
            self.yData.append(k)

        self.tfidf_data = TFIDF([self.xData, self.yData])
Example #3
    def test_tfidf(self):
        """
        Test the TF-IDF scheme.
        """

        idf = {'a': 2, 'b': 1, 'c': 1}
        tokens = ['a', 'b', 'b', 'c', 'd']
        tfidf = TFIDF(idf, 3)

        document = tfidf.create(tokens)
        self.assertEqual(0, document.dimensions['a'])
        self.assertEqual(0.35218, round(document.dimensions['b'], 5))
        self.assertEqual(0.17609, round(document.dimensions['c'], 5))
        self.assertEqual(0.47712, round(document.dimensions['d'], 5))
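The expected values above are reproducible with a raw term count as the local weight and log10(documents / (1 + document frequency)) as the global weight. The sketch below assumes that scheme purely to show the arithmetic; the actual TFIDF class may implement it differently.

import math

# Hypothetical re-derivation of the dimensions asserted in the test above.
# Assumption: local weight = raw term count, global weight = log10(N / (1 + df)).
def sketch_tfidf(tokens, idf, documents):
    weights = {}
    for token in set(tokens):
        tf = tokens.count(token)    # raw term frequency within this document
        df = idf.get(token, 0)      # document frequency from the IDF table
        weights[token] = tf * math.log10(documents / (1 + df))
    return weights

weights = sketch_tfidf(['a', 'b', 'b', 'c', 'd'], {'a': 2, 'b': 1, 'c': 1}, 3)
# round(weights['a'], 5) == 0.0, round(weights['b'], 5) == 0.35218
# round(weights['c'], 5) == 0.17609, round(weights['d'], 5) == 0.47712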
Example #4
    def parseQuery(self, query, invIndex):
        #Both handlers return the respective TF_IDFs
        #docTF_IDF can be run once after crawl
        tfidf = TFIDF()
        # print invIndex
        docTF_IDF = tfidf.docHandler(invIndex, 0)
        # print docTF_IDF
        queryTF_IDF = self.queryHandler(query, invIndex)
        if queryTF_IDF == -1:
            print "No words from your search were found in any documents...Please try new search terms!"
            return -1

        cosSimByDoc = self.cosSimilarityHandler(docTF_IDF, queryTF_IDF)
        # print "Cosine Similarity by document:", cosSimByDoc
        return cosSimByDoc
Example #5
 def __train_models(self):
     # Now load all sentences from specific domain, and train TFIDF model and NGramPerplexity model.
     self.ngp = NGramPerplexity()
     self.tfidf = TFIDF()
     print("Training models from specific corpora")
     for file in os.listdir(self.input_dir):
         print("Training models from specific corpora: " + file)
         with open(self.input_dir + "/" + file, encoding="utf-8") as input:
             for line in input:
                 words = WordExtractor.get_words(line)
                 if len(words) == 0:
                     continue
                 self.sentences.append(words)
                 self.ngp.train_from_text(words)
                 self.tfidf.train_from_text(words)
Example #6
def main():
    
    #try:
    c = corpus();
    tfidf = TFIDF()
    tf_type='aug_freq'
    idf_type='inv_smooth_idf'    
    for i, doc in enumerate(c.documents):
        cnt=0
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf.tfidf(word, doc, c.documents,tf_type, idf_type) for word in doc.words}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:10]:
            cnt+=1
            if(score>0):
                print("\tWord {}: {}, TF-IDF: {}".format(cnt, word, round(score, 5)))
Example #7
    def preproses(self, filepath):
        f = open(filepath)

        # split new line
        sents = f.read().split('\n')

        # shuffle all sentences order
        shuffle(sents)

        # on each sentence
        # - split by semicolon
        # - append to variable
        for sent in sents:
            temp = sent.split(';')
            if len(temp) == 2:
                self.xdata.append(temp[0])
                self.ydata.append([int(temp[1])])

        # prepare tfidf feature
        self.tfidf_data = TFIDF([self.xdata, self.ydata])
Example #8
def wordcount(filename, ent_file, tfidf, text, id):
    resources = open(filename)
    resources.readline()  # header
    wordcount = TFIDF(get_entities(ent_file))
    for id, lines in groupby(csv.reader(resources), id):
        maintext = ' '.join(text(line).lower() for line in lines)
        wordcount.process(maintext)
    wordcount.done()

    out = open(tfidf, 'w')
    for word, _, _, tfidf in wordcount.highest(200):
        out.write('%s\t%f\n' % (word, tfidf))
Example #9
def upload_file():
    if request.method == 'POST':
        if 'file' not in request.files:
            flash('No file part')
            # return redirect(request.url)
        file = request.files['file']

        if file.filename == '':
            flash('No file selected')
            # return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            # return redirect(url_for('upload_file', filename=filename))
        print(filename)
        fold = "data/" + filename
        print(fold)
        with open(fold, 'r') as csv_par:
            preproses()
            td = TFIDF([xdata, ydata])
            clasification = []
            csv_reader = csv_par.read().split('\n')

    for row in csv_reader:
        clasification.append(testFromTrained([td.transform(row)]))

    keras.clear_session()
    labels, values = np.unique(clasification, return_counts=True)
    lbls, vals = np.unique(clasification, return_counts=True)

    pie_labels = labels
    pie_values = values
    colors = ["#F7464A", "#46BFBD"]

    return render_template('hasil.html',
                           set=zip(values, labels, colors),
                           clasification=zip(csv_reader, clasification),
                           legenda=zip(lbls, vals))
Example #10
 def __train_models(self):
     # Now load all sentences from specific domain, and train TFIDF model and NGramPerplexity model.
     self.ngp = NGramPerplexity()
     self.tfidf = TFIDF()
     print("Training models from specific corpora")
     for file in os.listdir(self.input_dir):
         print("Training models from specific corpora: " + file)
         with open(self.input_dir + "/" + file, encoding="utf-8") as input:
             for line in input:
                 words = WordExtractor.get_words(line)
                 if len(words) == 0:
                     continue
                 self.sentences.append(words)
                 self.ngp.train_from_text(words)
                 self.tfidf.train_from_text(words)
Example #11
def parsing():

    with open('data/test.csv', 'r') as csv_par:
        preproses()
        td = TFIDF([xdata, ydata])
        rowdata = []
        clasification = []
        csv_reader = csv_par.read().split('\n')
    for row in csv_reader:
        rowdata.append(row)
        clasification.append(testFromTrained([td.transform(row)]))

    keras.clear_session()
    labels, values = np.unique(clasification, return_counts=True)
    lbls, vals = np.unique(clasification, return_counts=True)

    pie_labels = labels
    pie_values = values
    colors = ["#F7464A", "#46BFBD"]

    return render_template('hasil.html',
                           set=zip(values, labels, colors),
                           clasification=zip(csv_reader, clasification),
                           legenda=zip(lbls, vals))
Example #12
class Analysis:
    def __init__(self):
        self.preparation = Preparation()
        self.tfidf = TFIDF()

    def get_dataframe_from_json(self, json_data) -> pd.DataFrame:
        # convert json file to a pandas dataframe
        dataframe = self.preparation.jsonfile_to_dataframe(
            json.load(json_data))

        # refine the dataframe by removing entries that contain no articles
        dataframe = self.preparation.refine_dataframe(dataframe)
        return dataframe

    def get_tfidf_from_dataframe(self,
                                 dataframe: pd.DataFrame) -> pd.DataFrame:
        return self.tfidf.get_tfidf_dataframe(dataframe)
Example #13
def wordcount(filename, ent_file, tfidf, text, id):
  resources = open(filename)
  resources.readline() # header
  wordcount = TFIDF(get_entities(ent_file))
  for id, lines in groupby(csv.reader(resources), id):
    maintext = ' '.join(text(line).lower() for line in lines)
    wordcount.process(maintext)
  wordcount.done()

  out = open(tfidf, 'w')
  for word, _, _, tfidf in wordcount.highest(200):
    out.write('%s\t%f\n' % (word, tfidf))
Example #14
    def test_export(self):
        """
        Test exporting and importing the IDF table.
        """

        idf = {'a': 2, 'b': 1, 'c': 1}
        tfidf = TFIDF(idf, 3)

        e = tfidf.to_array()
        self.assertEqual(tfidf.global_scheme.documents,
                         TFIDF.from_array(e).global_scheme.documents)
        self.assertEqual(tfidf.global_scheme.idf,
                         TFIDF.from_array(e).global_scheme.idf)
        self.assertEqual(tfidf.local_scheme.__dict__,
                         TFIDF.from_array(e).local_scheme.__dict__)
        self.assertEqual(tfidf.global_scheme.__dict__,
                         TFIDF.from_array(e).global_scheme.__dict__)
Example #15
def getRecommendation(new_df, record):
    temp_df = new_df[['id','name', 'album', 'artist', 'release_date']]
    temp_df = pd.concat([temp_df, record], ignore_index = True)
    
    col = ['name', 'album', 'artist', 'release_date']
    data = pd.DataFrame(columns=col)
    id = []
    for i in col:
        yield "<br/>"
        tf = TFIDF(temp_df, i)
        cosine_sim = linear_kernel(tf, tf) 
        data[i] = cosine_sim[-1]
        d1 = data.sort_values(by=[i], ascending=False)
        id.append(list(d1.head(7).index))
    
    tid = []
    for i in range(4):
        track_id = []
        for j in id[i]:
            track_id.append(temp_df.iloc[j, 0]) 
        tid.append(track_id)
    return tid 
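TFIDF(temp_df, i) above is assumed to return a TF-IDF matrix built from a single text column, so that linear_kernel(tf, tf) gives pairwise cosine similarities (the linear kernel equals cosine similarity when the TF-IDF rows are L2-normalised). A hypothetical stand-in built on scikit-learn, not the project's own class, could look like this:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

def tfidf_matrix(dataframe, column):
    # Vectorise one text column; rows are L2-normalised by default.
    vectorizer = TfidfVectorizer(stop_words='english')
    return vectorizer.fit_transform(dataframe[column].astype(str))

# tf = tfidf_matrix(temp_df, 'artist'); cosine_sim = linear_kernel(tf, tf)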
Example #16
def count(district,
          type='essays',
          extract_text=lambda line: ' '.join(line[3:10]),
          id=lambda line: line[0]):
    (_projectid, _teacher_acctid, _schoolid, school_ncesid, school_latitude,
     school_longitude, school_city, school_state, school_zip, school_metro,
     school_district, school_county, school_charter, school_magnet,
     school_year_round, school_nlns, school_kipp, school_charter_ready_promise,
     teacher_prefix, teacher_teach_for_america, teacher_ny_teaching_fellow,
     primary_focus_subject, primary_focus_area, secondary_focus_subject,
     secondary_focus_area, resource_usage, resource_type, poverty_level,
     grade_level, vendor_shipping_charges, sales_tax,
     payment_processing_charges, fulfillment_labor_materials,
     total_price_excluding_optional_support,
     total_price_including_optional_support, students_reached,
     used_by_future_students, total_donations, num_donors,
     eligible_double_your_impact_match, eligible_almost_home_match,
     funding_status, date_posted, date_completed, date_thank_you_packet_mailed,
     date_expiration) = range(46)
    proj_ids = []
    projects = open('../data/projects.%scsv' % district)
    projects.readline().strip()  # header
    for proj in csv.reader(projects):
        if proj[date_posted].startswith('2011'):
            proj_ids.append(proj[0])
    proj_ids = frozenset(proj_ids)
    projects.close()

    wordcount = TFIDF(get_entities(ent_file))
    essays = open('../data/%s.%scsv' % (type, district))
    essays.readline()  # header
    for proid, lines in groupby(csv.reader(essays), id):
        if proid in proj_ids:
            text = ' '.join(extract_text(line) for line in lines).lower()
            wordcount.process(text)
    wordcount.done()
    essays.close()

    out = open('../data/wc_%s%scsv' % (type, district), 'w')
    for word, tf, df, tfidf in wordcount.highest(0):
        out.write('%s\t%f\t%f\t%f\n' % (word, tf, df, tfidf))
Example #17
File: tagger.py Project: xqk/tag-generator
class Tagger:
    def __init__(self):
        self.documents = {}
        self.tfidf = TFIDF()

    def add_document(self, document):
        self.documents[document.id] = document

    def display(self):
        for id in self.documents:
            self.documents[id].display()

    def get_terms_weighted_by_tfidf(self, document):
        documents = [self.documents[key] for key in self.documents]
        tfidf_list = self.tfidf.calculate_tfidf_document(documents, document)
        weighted_terms = {}
        for d in tfidf_list:
            term = d["term"]
            tf = d["tf"]
            idf = d["idf"]
            weighted_terms[term] = tf * idf
        return weighted_terms

    def get_tags_using_weighted_terms(self, weighted_terms, size=5):
        sorted_terms = sorted(weighted_terms.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
        length = len(weighted_terms)
        size = length if size > length else size
        tags = []
        for i in range(size):
            tags.append(sorted_terms[i][0])
        return tags

    def __str__(self):
        return str(pprint(vars(self)))
Example #18
def count(district, type='essays', extract_text=lambda line: ' '.join(line[3:10]), id=lambda line:line[0]):
  (_projectid, _teacher_acctid, _schoolid, school_ncesid, school_latitude,
   school_longitude, school_city, school_state, school_zip, school_metro,
   school_district, school_county, school_charter, school_magnet,
   school_year_round, school_nlns, school_kipp, school_charter_ready_promise,
   teacher_prefix, teacher_teach_for_america, teacher_ny_teaching_fellow,
   primary_focus_subject, primary_focus_area, secondary_focus_subject,
   secondary_focus_area, resource_usage, resource_type, poverty_level,
   grade_level, vendor_shipping_charges, sales_tax,
   payment_processing_charges, fulfillment_labor_materials,
   total_price_excluding_optional_support,
   total_price_including_optional_support, students_reached,
   used_by_future_students, total_donations, num_donors,
   eligible_double_your_impact_match, eligible_almost_home_match,
   funding_status, date_posted, date_completed, date_thank_you_packet_mailed,
   date_expiration) = range(46)
  proj_ids = []
  projects = open('../data/projects.%scsv' % district)
  projects.readline().strip() # header
  for proj in csv.reader(projects):
    if proj[date_posted].startswith('2011'):
      proj_ids.append(proj[0])
  proj_ids = frozenset(proj_ids)
  projects.close()

  wordcount = TFIDF(get_entities(ent_file))
  essays = open('../data/%s.%scsv' % (type, district))
  essays.readline() # header
  for proid, lines in groupby(csv.reader(essays), id):
    if proid in proj_ids:
      text = ' '.join(extract_text(line) for line in lines).lower()
      wordcount.process(text)
  wordcount.done()
  essays.close()

  out = open('../data/wc_%s%scsv' % (type, district), 'w')
  for word, tf, df, tfidf in wordcount.highest(0):
    out.write('%s\t%f\t%f\t%f\n' % (word, tf, df, tfidf))
Example #19
def get_pred_api_set(desc):
    tfidf = TFIDF(desc).gen_vector()
    cluster = Match(tfidf).match()
    topN = TopN(cluster).get()
    return set(topN)
Example #20
			static.upload_folder(Sm_Cover_Dir, overwrite=True)
			static.upload_folder(Bg_Cover_Dir, overwrite=True)
			logger.info("update static server success !")

			with NewsDB() as db:
				db.update_table_newsContent(method="rebuild", fromCache=False)
				db.update_table_newsDetail(method="update")'''

			with NewsDB() as db: # do not update static, only update the DB
				db.update_table_newsInfo(method="rebuild", fromCache=False)
				db.update_table_newsContent(method="rebuild", fromCache=False)
				db.update_table_newsDetail(method="update")

			WhooshIdx().create_idx()
			logger.info("update TFIDF ...")
			tfidf = TFIDF().init_for_update()
			tfidf.update()
			logger.info("update TFIDF success !")

		else: # for routine daily updates
			with NewsDB() as db:
				db.update_table_newsInfo(fromCache=False)
				newsIDs = db.get_newsIDs()

			# right after updating the IDs, refresh the static server first, so that users requesting images during the update window do not cache 404 pages
			logger.info("update static server ...")
			static = StaticManager(newsIDs)
			static.download_covers()
			static.to_jpeg()
			static.cv_compress_sm() # writing the output directly is enough
			static.cv_compress_bg()
Example #21
from tfidf import TFIDF
from match import Match
from topN import TopN
import sys

desc = sys.argv[1]

# online phase step 1
tfidf = TFIDF(desc).gen_vector()

# online phase step 2
cluster = Match(tfidf).match()

# online phase step 3
topN = TopN(cluster).get()
for i in topN:
	print(i)
Example #22
import pickle

from TrainingWithTFIDF import TFIDFTrainer
tfidfTrainer = TFIDFTrainer()
from FeatureExtractionWithTFIDF import TFIDFPreparer
from tfidf import TFIDF
tfidfInstance = TFIDF()
import nltk
tfidfPreparer = TFIDFPreparer()


class IntentDetector:
    def prepareForNLP(self, text):
        sentences = nltk.sent_tokenize(text)
        sentences = [nltk.word_tokenize(sent) for sent in sentences]
        sentences = [nltk.pos_tag(sent) for sent in sentences]
        return sentences

    def getFilterChunk(self, sentence):
        chunkToExtract = """
            pattern:
           
            {<NNP|NNS|NN><WDT>?<VBP|VBZ>?<JJR>?<IN><CD><CC>?<CD>?}
               """

        parser = nltk.RegexpParser(chunkToExtract)
        result = parser.parse(sentence)

        chunks = []
        for subtree in result.subtrees():
            if subtree.label() == 'pattern':
Example #23
def start() :

    #initialize TFIDF
    tfidf = TFIDF("tfidf_data/name_and_abstracts.txt")
    print("TFIDF initialized")

    #input_file = open("publications.txt")
    input_file = open("pub_min.txt")
    while True :
        '''
            Parse paper title.
            Test for EOF.
        '''
        line = input_file.readline().strip()
        if len(line) == 0 :
            break
        assert line[:2] == "#*"
        title = line[2:]
        toks = word_tokenize(title)
        toks = sorted(toks, key=lambda x : tfidf.tf_idf(x), reverse = True)
        print "sorted toks:"+str(toks)

        '''
            Parse author.
        '''
        line = input_file.readline().strip()
        assert line[:2] == "#@"
        authors = line[2:].split(',')

        '''
            Parse Year
        '''
        input_file.readline()

        '''
            Parse Venue
        '''
        line = input_file.readline().strip()
        assert line[:2] == "#c"
        venue = line[2:]

        '''
            Parse paper id.
            Do not cast to integer. Simply unnecessary.
        '''
        line = input_file.readline().strip()
        assert line[:6] == "#index"
        id = line[6:]

        for a in authors :
            dictionary_add_set(author_papers, a, id)
            dictionary_add_set(author_venues, a, venue)
        dictionary_add_set(venue_papers, venue, id)

        paper_venue[id] = venue
        paper_authors[id] = authors

        '''
            Parse citations.
        '''
        line = input_file.readline().strip()
        while line[:2] == "#%" :
            '''
                Invalid/empty citation.
            '''
            if len(line) <= 2 :
                break
            dictionary_add_set(paper_papers, id, line[2:])
            line = input_file.readline().strip()

        '''
            Read the empty string line so the readline output is not confused with
            EOF.
            Sets the reading pointer to the next paper's title line.
        '''
        line = input_file.readline()
        if line[:2] == "#!" :
            input_file.readline()

    return paper_authors, \
           paper_papers, \
           paper_venue, \
           author_papers, \
           venue_papers, \
           author_venues
Example #24
def start(tfidf_threshold):

    #initialize TFIDF
    phrase_file = open("text_segmented_by_phrase.txt", "r")
    for line in phrase_file:
        index, text = line.split("##")
        token_list = text.lower().strip().split("!!")
        id_phrases[index] = token_list
    phrase_file.close()
    tfidf = TFIDF(id_phrases.values())
    print("TFIDF initialized")

    input_file = open("publications.txt")
    #input_file = open("pub_min.txt")
    while True:
        '''
            Parse paper title.
            Test for EOF.
        '''
        line = input_file.readline().strip()
        if len(line) == 0:
            break
        assert line[:2] == "#*"
        title = line[2:]
        '''
            Parse author.
        '''
        line = input_file.readline().strip()
        assert line[:2] == "#@"
        authors = line[2:].split(',')
        '''
            Parse Year
        '''
        input_file.readline()
        '''
            Parse Venue
        '''
        line = input_file.readline().strip()
        assert line[:2] == "#c"
        venue = line[2:]
        '''
            Parse paper id.
            Do not cast to integer. Simply unnecessary.
        '''
        line = input_file.readline().strip()
        assert line[:6] == "#index"
        id = line[6:]
        id_title[id] = title

        for a in authors:
            dictionary_add_set(author_papers, a, id)
            dictionary_add_set(author_venues, a, venue)
        dictionary_add_set(venue_papers, venue, id)

        paper_venue[id] = venue
        paper_authors[id] = authors
        '''
            Parse citations.
        '''
        line = input_file.readline().strip()
        while line[:2] == "#%":
            '''
                Invalid/empty citation.
            '''
            if len(line) <= 2:
                break
            dictionary_add_set(paper_papers, id, line[2:])
            line = input_file.readline().strip()
        '''
            Read the empty string line so the readline output is not confused with
            EOF.
            Sets the reading pointer to the next paper's title line.
        '''
        line = input_file.readline()
        if line[:2] == "#!":
            input_file.readline()
    '''
        Get terms for each paper.
    '''
    phrase_file = open("text_segmented_by_phrase.txt", "r")
    for paper_id, tok_list in id_phrases.items():
        '''
            Assuming (id, list_of_tokens). If I'm wrong, the code will HCF.
        '''
        toks = [x for x in tok_list if len(x) > 2 and \
                                    tfidf.tf_idf(x) > tfidf_threshold]
        toks = sorted(toks, key=lambda x: tfidf.tf_idf(x), reverse=False)
        paper_terms[paper_id] = toks[:min(3, len(toks))]
        for term in paper_terms[paper_id]:
            if not term_papers.has_key(term):
                term_papers[term] = []
            term_papers[term].append(paper_id)

    return paper_authors, \
           paper_papers, \
           paper_venue, \
           author_papers, \
           venue_papers, \
           author_venues
Example #25
class Analiser:

    xData = []
    yData = []

    def __init__(self, training_data='dataset\processedDataset_pool2.csv'):
        self.preprocess(training_data)
        return None

    def preprocess(self, filepath):
        dataset = pd.read_csv(filepath, delimiter=',')

        self.xData = []
        self.yData = []

        for k in dataset['Kalimat']:
            self.xData.append(k)

        for k in dataset['Formalitas']:
            self.yData.append(k)

        self.tfidf_data = TFIDF([self.xData, self.yData])

    def save_model(self, model, file_name='model'):
        self.model_load = model

        model_json = model.to_json()
        with open('model/' + file_name + '.json', 'w') as json_file:
            json_file.write(model_json)

        model.save_weights('model/' + file_name + '.h5')
        print("Save model to disk")

    def load_model(self, file_name='model'):
        model = Sequential()

        json_file = open('model/' + file_name + '.json', 'r')
        loaded_model_json = json_file.read()

        json_file.close()
        model = model_from_json(loaded_model_json)

        model.load_weights('model/' + file_name + '.h5')
        print("Loaded model from disk")

        self.model_load = model
        return model

    def train(self, output_file='model'):
        x = self.tfidf_data.getOnlyXData()
        y = []

        for i in self.yData:
            if i == 1:
                y.append([1, 0])
            else:
                y.append([0, 1])

        model = Sequential()

        input_data_dimen = len(x[0])
        input_data_dimen = 1800 if input_data_dimen > 1800 else input_data_dimen

        model.add(
            Dense(units=int(0.39 * input_data_dimen),
                  activation='tanh',
                  input_dim=input_data_dimen))

        model.add(
            Dense(units=int(0.075 * 0.39 * input_data_dimen),
                  activation='tanh'))

        model.add(Dense(units=2, activation='softmax'))

        learning_rate = .025
        batch_size = 16
        loss_error = 'categorical_crossentropy'
        epoch = 20

        sgd = SGD(lr=learning_rate)

        model.compile(optimizer=sgd, loss=loss_error, metrics=['accuracy'])

        x_train, x_test, y_train, y_test = self.train_custom_split(x, y, 0.5)

        self.history = model.fit(x=x_train,
                                 y=y_train,
                                 validation_data=(x_test, y_test),
                                 batch_size=batch_size,
                                 nb_epoch=epoch)

        self.save_model(model, output_file)

    def train_custom_split(self, x, y, sr_train, test_ratio=0.2):
        dataset = []

        for i in range(len(y)):
            dataset.append([x[i], y[i]])

        shuffle(dataset)

        formal = []
        informal = []

        for i in range(len(dataset)):
            if dataset[i][1][0] == 0:
                informal.append(dataset[i])
            else:
                formal.append(dataset[i])

        x_train = []
        x_test = []
        x_test_temp = []
        y_train = []
        y_test = []
        y_test_temp = []

        formal_len = len(formal)
        formal_rat = formal_len * sr_train
        inform_len = len(informal)
        inform_rat = inform_len * sr_train

        for i in range(formal_len):
            if i < formal_rat:
                x_train.append(formal[i][0])
                y_train.append(formal[i][1])
            else:
                x_test_temp.append(formal[i][0])
                y_test_temp.append(formal[i][1])

        for i in range(inform_len):
            if i < inform_rat:
                x_train.append(informal[i][0])
                y_train.append(informal[i][1])
            else:
                x_test_temp.append(informal[i][0])
                y_test_temp.append(informal[i][1])

        test_len = len(y_test_temp)
        test_rat = test_ratio * test_len

        for i in range(test_len):
            if i >= test_rat:
                x_train.append(x_test_temp[i])
                y_train.append(y_test_temp[i])
            else:
                x_test.append(x_test_temp[i])
                y_test.append(y_test_temp[i])

        return np.array(x_train), np.array(x_test), np.array(
            y_train), np.array(y_test)

    def getBinaryResult(self, x):
        print(x)
        return "FORMAL" if x[0][0] > x[0][1] else "NON FORMAL"

    def testFromTrained(self, x):
        if self.model_load == 'None':
            print("Model tidak ditemukan!")
            exit(0)

        return self.getBinaryResult(self.model_load.predict_proba(np.array(x)))

    def showPlot(self):
        history = self.history

        # for plotting model accuracy
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('Model Accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

        # for plotting model loss
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('Model Loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')

        plt.show()
Example #26
 def calcTFIDF(self):
     t = TFIDF()
     self.tfidf = t.docHandler(self.inverted_index, self.unique_id)
Example #27
class DomainSimilarity:
    def __init__(self, input_dir, threshold_tfidf, threshold_perplexity_ngram,
                 threshold_edit_distance):
        self.threshold_tfidf = 1 - threshold_tfidf
        self.threshold_perplexity_ngram = threshold_perplexity_ngram
        self.threshold_edit_distance = threshold_edit_distance
        self.input_dir = input_dir
        self.sentences = []
        if not os.path.isdir(input_dir):
            raise Exception("The provided dir " + str(input_dir) +
                            " does not exist")
        self.__train_models()

        self.queries_asked = 0
        self.sentences_asked = 0
        self.accepted_by_tfidf = 0
        self.accepted_by_ngp = 0
        self.accepted_by_edit_distance = 0
        self.sum_tfidf = 0
        self.sum_ngp = 0
        self.sum_edit = 0

    def __train_models(self):
        # Now load all sentences from specific domain, and train TFIDF model and NGramPerplexity model.
        self.ngp = NGramPerplexity()
        self.tfidf = TFIDF()
        print("Training models from specific corpora")
        for file in os.listdir(self.input_dir):
            print("Training models from specific corpora: " + file)
            with open(self.input_dir + "/" + file, encoding="utf-8") as input:
                for line in input:
                    words = WordExtractor.get_words(line)
                    if len(words) == 0:
                        continue
                    self.sentences.append(words)
                    self.ngp.train_from_text(words)
                    self.tfidf.train_from_text(words)

    def print_progress(self):
        print("Average tfidf: " + str(1 - self.sum_tfidf / self.queries_asked))
        print("Average ngram-perplexity: " +
              str(self.sum_ngp / self.sentences_asked))
        print("Average edit-distance: " +
              str(self.sum_edit / self.queries_asked))
        print("Accept percent by tfidf extractor: " +
              Formatter.percent(self.accepted_by_tfidf / self.sentences_asked))
        print("Accept percent by ngram-perplexity extractor: " +
              Formatter.percent(self.accepted_by_ngp / self.sentences_asked))
        print("Accept percent by edit-distance extractor: " +
              Formatter.percent(self.accepted_by_edit_distance /
                                self.sentences_asked))

    def accepts_sentence(self, words_general):
        # sentence_general: string
        # Returns True if similarity of sentence_general is either:
        # > threshold1 according to tf-idf of one of stored sentences
        # > threshold2 according to ngramperplexity of one of stored sentences
        # > threshold3 according to levenshtein of one of stored sentences
        self.sentences_asked += 1
        accept_ngp = False
        accept_tfidf = False
        accept_edit_distance = False

        perplexity = self.ngp.calc_perplexity(words_general)
        self.sum_ngp += perplexity
        if perplexity <= self.threshold_perplexity_ngram:
            if RUN_CONFIGURATION.mode == MODE.TURBO:
                return True
            self.accepted_by_ngp += 1
            accept_ngp = True

        for words_specific in self.sentences:
            self.queries_asked += 1
            if accept_tfidf and accept_edit_distance:
                return True
            if not accept_tfidf:
                sim = self.tfidf.calc_cosine_similarity(
                    words_general, words_specific)
                self.sum_tfidf += sim
                if sim >= self.threshold_tfidf:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_tfidf += 1
                    accept_tfidf = True
            if not accept_edit_distance:
                edit_distance = Levenshtein.normalized_distance(
                    words_general, words_specific)
                self.sum_edit += edit_distance
                if edit_distance <= self.threshold_edit_distance:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_edit_distance += 1
                    accept_edit_distance = True

        if accept_tfidf or accept_ngp or accept_edit_distance:
            return True

        return False
Example #28
from tfidf import TFIDF
from wordextractor import WordExtractor

def calc_most_similar(tfidf, query, docs):
    best_doc = ""
    max_sim = 0
    for doc in docs:
        sim = tfidf.calc_cosine_similarity(WordExtractor.get_words(query), WordExtractor.get_words(doc))
        if sim > max_sim:
            max_sim = sim
            best_doc = doc
    return best_doc

tfidf = TFIDF()
d1 = "It is a great day today"
d2 = "The weather is absolutely great"
d3 = "It is so warm today, almost too hot"
d4 = "We're very happy with the weather today"
d5 = "It is a great day to be at the beach!"
d6 = "We should get out and enjoy the weather right now :)"
d7 = "I've bought a radio I plan to bring to the beach today"
d8 = "The beach is a bit crowded"
d9 = "There's many kids at the beach today"
d10 = "If it starts to rain at the beach, I will go home"

documents = [d1, d2, d3, d4, d5, d6, d7, d8, d9, d10]
for doc in documents:
    words = WordExtractor.get_words(doc)
    tfidf.train_from_text(words)

q1 = "We have not had rain for a long time!"
Example #29
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)

    # load weights into new self.model
    model.load_weights("model/model.h5")
    print("Loaded model from disk")

    sgd = SGD(lr=0.01)

    model.compile(loss='binary_crossentropy', optimizer=sgd)
    return getBinaryResult(model.predict_proba(np.array(x)))


preproses()
td = TFIDF([xdata, ydata])

# TRAINING
# train(td.getOnlyX(), ydata)

# RETRAINING
# retrain_model(td.getOnlyX(), ydata)

# TESTING
test = "ahok itu pemimpin yang beres memimpin"
print test
print testFromTrained([td.transform(test)])

test = "ahok itu pemimpin yang ga beres memimpin"
print test
print testFromTrained([td.transform(test)])
Example #30
File: ucs.py Project: yhjw88/deepreader
def reward2(s1, s2):
    indices = corpus[:]
    tfi = TFIDF()
    tfidf = tfi.get_tfidf(corpus)
    score = tfi.relevancy(tfidf, indices, s1, s2)
    return score + 1
Example #31
class Query:

    def __init__(self, queryString=""):
        print "Constructing Query Object!"
        self.invIndex = InvertedIndex()
        self.tfidf = TFIDF()
        self.query = queryString

    ###################### Other Functions ######################
    def removeMissingQueryTerms(self, queryList, invIndex):
        removedWords = []
        for word in queryList:
            isWordInIndex = invIndex.get(word, -1)
            if isWordInIndex == -1:
                removedWords.append(word)
                queryList.remove(word)
        return removedWords

    ###################### Query Functions ######################
    def normQueryTF(self, frequencyDict, numTerms):
        print numTerms, frequencyDict
        for key, val in frequencyDict.iteritems():
            frequencyDict[key] = val/numTerms
        return frequencyDict

    def calcQueryCollectionFrequency(self, queryWords):
        collectionFrequency = 0
        frequencyDict = {}
        for i, word in enumerate(queryWords):
            for j, checkWord in enumerate(queryWords):
                if word == checkWord:
                    collectionFrequency += 1
            frequencyDict[word] = collectionFrequency
            collectionFrequency = 0
        return frequencyDict

    def calcQueryIDF(self, freqDict, numDocs):
        tempIDF = {}
        for key, value in freqDict.iteritems():
            #df is 1 because only one document (query) for terms to be in
            tempIDF[key] = 1 + math.log10(numDocs/1)
        return tempIDF

    def calcQueryTF_IDF(self, normFrequencyDict, queryIDF):
        tf_idf = {}
        for key, value in normFrequencyDict.iteritems():
            tf_idf[key] = normFrequencyDict[key] * queryIDF[key]
        return tf_idf

    def queryHandler(self, query, invIndex):
        # print "Initializing Query Handler..."
        queryWordList = query.split(' ')
        # print "Query Word List after split"
        # print queryWordList
        #Remove stop words, clean case/punct, stemm
        print "Parsing Query Words..."
        parser = Parser()
        queryWordList = parser.fullParse(queryWordList)
        # print "Query Word List after parse"
        # print queryWordList
        # print "Parsed Query Words..."

        removedWords = self.removeMissingQueryTerms(queryWordList, invIndex)
        if len(queryWordList) == 0:
            return -1
        print "These words were not found and removed from the query: ", removedWords
        print "Updated Query Words List", queryWordList


        numTerms = len(queryWordList)
        numDocs = self.tfidf.findNumDocs(invIndex)

        freqDict = self.calcQueryCollectionFrequency(queryWordList)
        normFrequencyDict = self.normQueryTF(freqDict, numTerms)

        #I believe this is always 1 for a query because df is number of docs in collection with term and query is one doc and the only doc in the collection
        #calcQueryDF(temp)
        queryIDF = self.calcQueryIDF(freqDict, numDocs)
        queryTF_IDF = self.calcQueryTF_IDF(normFrequencyDict, queryIDF)

        return queryTF_IDF

    ###################### Cosine Similarity Functions ######################
    def calcQueryDocDotProduct(self, docTF_IDF, queryTF_IDF):
        docDotProducts = {}
        dotProd = 0

        for key, value in queryTF_IDF.iteritems():
            isKeyInDoc = docTF_IDF.get(key, -1)
            if isKeyInDoc != -1:
                for k, v in docTF_IDF[key].iteritems():
                    isDocIdInTempDotProd = docDotProducts.get(k, -1)
                    if isDocIdInTempDotProd == -1:
                        docDotProducts[k] = value * v
                    else:
                        docDotProducts[k] += value * v
            else:
                print key, "not found in index"

        return docDotProducts

    def calcQueryEuclideanLength(self, queryTF_IDF):
        tempLength = 0
        for key, value in queryTF_IDF.iteritems():
            tempLength += value * value
        return math.sqrt(tempLength)

    def calcDocEuclideanLength(self, docTF_IDF, queryTF_IDF):
        tempLength = 0
        tempDocEuclideanLength = {}

        for key, value in queryTF_IDF.iteritems():
            for k, v in docTF_IDF[key].iteritems():
                isDocInDict = tempDocEuclideanLength.get(k, -1)
                if isDocInDict == -1:
                    tempDocEuclideanLength[k] = v*v
                else:
                    tempDocEuclideanLength[k] += v*v

        for newKey, newVal in tempDocEuclideanLength.iteritems():
            tempDocEuclideanLength[newKey] = math.sqrt(tempDocEuclideanLength[newKey])
        return tempDocEuclideanLength



    def calcCosSimilarity(self, queryDocDotProducts, docLength, queryLength):
        tempCosSimDocs = {}
        tempCosSimVal = 0

        for key, value in queryDocDotProducts.iteritems():
            tempCosSimDocs[key] = (queryDocDotProducts[key])/(docLength[key]*queryLength)
        return tempCosSimDocs

    def cosSimilarityHandler(self, docTF_IDF, queryTF_IDF):
        print "Initializing Cosine Similarity Handler..."
        queryDocDotProducts = self.calcQueryDocDotProduct(docTF_IDF, queryTF_IDF)
        docLength = self.calcDocEuclideanLength(docTF_IDF, queryTF_IDF)
        queryLength = self.calcQueryEuclideanLength(queryTF_IDF)

        return self.calcCosSimilarity(queryDocDotProducts, docLength, queryLength)

    def parseQuery(self, query, invIndex):
        #Both handlers return the respective TF_IDFs
        #docTF_IDF can be run once after crawl
        tfidf = TFIDF()
        # print invIndex
        docTF_IDF = tfidf.docHandler(invIndex, 0)
        # print docTF_IDF
        queryTF_IDF = self.queryHandler(query, invIndex)
        if queryTF_IDF == -1:
            print "No words from your search were found in any documents...Please try new search terms!"
            return -1

        cosSimByDoc = self.cosSimilarityHandler(docTF_IDF, queryTF_IDF)
        # print "Cosine Similarity by document:", cosSimByDoc
        return cosSimByDoc



    def printDictionaries(self, d):
        print "{:<8} {:<10}".format('DocID ,','Number')
        for k, v in d.iteritems():
            num = v
            print "{:<8}{:<10}".format(k, num)
Example #32
from tfidf import TFIDF
from cluster import Cluster
from matrices import Matrices
from topN import TopN

# offline phase step 1
TFIDF.gen_vector()
Cluster.gen_vector()

# offline phase step 2
Matrices.gen_matrices()

#offline phase step 3 & step 4
TopN.gen_topN()
Example #33
def start(tfidf_threshold) :

    #initialize TFIDF
    phrase_file = open("text_segmented_by_phrase.txt", "r")
    for line in phrase_file :
        index, text = line.split("##")
        token_list = text.lower().strip().split("!!")
        id_phrases[index] = token_list
    phrase_file.close()
    tfidf = TFIDF(id_phrases.values())
    print("TFIDF initialized")

    input_file = open("publications.txt")
    #input_file = open("pub_min.txt")
    while True :
        '''
            Parse paper title.
            Test for EOF.
        '''
        line = input_file.readline().strip()
        if len(line) == 0 :
            break
        assert line[:2] == "#*"
        title = line[2:]


        '''
            Parse author.
        '''
        line = input_file.readline().strip()
        assert line[:2] == "#@"
        authors = line[2:].split(',')

        '''
            Parse Year
        '''
        input_file.readline()

        '''
            Parse Venue
        '''
        line = input_file.readline().strip()
        assert line[:2] == "#c"
        venue = line[2:]

        '''
            Parse paper id.
            Do not cast to integer. Simply unnecessary.
        '''
        line = input_file.readline().strip()
        assert line[:6] == "#index"
        id = line[6:]
        id_title[id] = title

        for a in authors :
            dictionary_add_set(author_papers, a, id)
            dictionary_add_set(author_venues, a, venue)
        dictionary_add_set(venue_papers, venue, id)

        paper_venue[id] = venue
        paper_authors[id] = authors

        '''
            Parse citations.
        '''
        line = input_file.readline().strip()
        while line[:2] == "#%" :
            '''
                Invalid/empty citation.
            '''
            if len(line) <= 2 :
                break
            dictionary_add_set(paper_papers, id, line[2:])
            line = input_file.readline().strip()

        '''
            Read the empty string line so the readline output is not confused with
            EOF.
            Sets the reading pointer to the next paper's title line.
        '''
        line = input_file.readline()
        if line[:2] == "#!" :
            input_file.readline()

    '''
        Get terms for each paper.
    '''
    phrase_file = open("text_segmented_by_phrase.txt", "r")
    for paper_id, tok_list in id_phrases.items() :
        '''
            Assuming (id, list_of_tokens). If I'm wrong, the code will HCF.
        '''
        toks = [x for x in tok_list if len(x) > 2 and \
                                    tfidf.tf_idf(x) > tfidf_threshold]
        toks = sorted(toks, key=lambda x : tfidf.tf_idf(x), reverse = False)
        paper_terms[paper_id] = toks[: min(3, len(toks))]
        for term in paper_terms[paper_id] :
            if not term_papers.has_key(term) :
                term_papers[term] = []
            term_papers[term].append(paper_id)

    return paper_authors, \
           paper_papers, \
           paper_venue, \
           author_papers, \
           venue_papers, \
           author_venues
Example #34
class DomainSimilarity:

    def __init__(self, input_dir, threshold_tfidf, threshold_perplexity_ngram, threshold_edit_distance):
        self.threshold_tfidf = 1-threshold_tfidf
        self.threshold_perplexity_ngram = threshold_perplexity_ngram
        self.threshold_edit_distance = threshold_edit_distance
        self.input_dir = input_dir
        self.sentences = []
        if not os.path.isdir(input_dir):
            raise Exception("The provided dir " + str(input_dir) + " does not exist")
        self.__train_models()

        self.queries_asked = 0
        self.sentences_asked = 0
        self.accepted_by_tfidf = 0
        self.accepted_by_ngp = 0
        self.accepted_by_edit_distance = 0
        self.sum_tfidf = 0
        self.sum_ngp = 0
        self.sum_edit = 0

    def __train_models(self):
        # Now load all sentences from specific domain, and train TFIDF model and NGramPerplexity model.
        self.ngp = NGramPerplexity()
        self.tfidf = TFIDF()
        print("Training models from specific corpora")
        for file in os.listdir(self.input_dir):
            print("Training models from specific corpora: " + file)
            with open(self.input_dir + "/" + file, encoding="utf-8") as input:
                for line in input:
                    words = WordExtractor.get_words(line)
                    if len(words) == 0:
                        continue
                    self.sentences.append(words)
                    self.ngp.train_from_text(words)
                    self.tfidf.train_from_text(words)

    def print_progress(self):
        print("Average tfidf: " + str(1 - self.sum_tfidf / self.queries_asked))
        print("Average ngram-perplexity: " + str(self.sum_ngp / self.sentences_asked))
        print("Average edit-distance: " + str(self.sum_edit / self.queries_asked))
        print("Accept percent by tfidf extractor: " + Formatter.percent(self.accepted_by_tfidf / self.sentences_asked))
        print("Accept percent by ngram-perplexity extractor: " + Formatter.percent(self.accepted_by_ngp / self.sentences_asked))
        print("Accept percent by edit-distance extractor: " + Formatter.percent(self.accepted_by_edit_distance / self.sentences_asked))

    def accepts_sentence(self, words_general):
        # sentence_general: string
        # Returns True if similarity of sentence_general is either:
        # > threshold1 according to tf-idf of one of stored sentences
        # > threshold2 according to ngramperplexity of one of stored sentences
        # > threshold3 according to levenshtein of one of stored sentences
        self.sentences_asked += 1
        accept_ngp = False
        accept_tfidf = False
        accept_edit_distance = False

        perplexity = self.ngp.calc_perplexity(words_general)
        self.sum_ngp += perplexity
        if perplexity <= self.threshold_perplexity_ngram:
            if RUN_CONFIGURATION.mode == MODE.TURBO:
                return True
            self.accepted_by_ngp += 1
            accept_ngp = True

        for words_specific in self.sentences:
            self.queries_asked += 1
            if accept_tfidf and accept_edit_distance:
                return True
            if not accept_tfidf:
                sim = self.tfidf.calc_cosine_similarity(words_general, words_specific)
                self.sum_tfidf += sim
                if sim >= self.threshold_tfidf:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_tfidf += 1
                    accept_tfidf = True
            if not accept_edit_distance:
                edit_distance = Levenshtein.normalized_distance(words_general, words_specific)
                self.sum_edit += edit_distance
                if edit_distance <= self.threshold_edit_distance:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_edit_distance += 1
                    accept_edit_distance = True

        if accept_tfidf or accept_ngp or accept_edit_distance:
            return True

        return False
Example #35
    # read scrap_workbook
    scrap_workbook = read_scrap(args.scrap_file_name)
    #
    ## ES6
    ES6_sheet = scrap_workbook["蔚来ES6"]
    review_container = ReviewContainer(ES6_sheet)
    review_list = review_container.get_review_list()
    doc_word_count_info_list = build_doc_word_count_info_list(review_list)

    ## build model data structure
    term_container = TermContainer(doc_word_count_info_list)
    inverted_file = InvertedFile(term_container, doc_word_count_info_list)

    # build query
    query_list = get_query_list(args.query_expand_workbook_path)
    query_expand_impl = QueryExpandImpl(args.query_expand_workbook_path)
    set_topk_for_query_list(query_list, args.topk)
    apply_query_expand_to_query_list(query_list, query_expand_impl)

    # search
    tfidf_engine = TFIDF(review_container)
    apply_query_search_to_query_list(query_list, inverted_file, tfidf_engine,
                                     review_container)

    # output_workbook
    workbook = Workbook()
    update_workbook_for_query_list(query_list, review_container, workbook)
    workbook.remove(workbook['Sheet'])
    workbook.save(args.output_path)
Example #36
File: tagger.py Project: xqk/tag-generator
 def __init__(self):
     self.documents = {}
     self.tfidf = TFIDF()
Example #37
 def __init__(self, queryString=""):
     print "Constructing Query Object!"
     self.invIndex = InvertedIndex()
     self.tfidf = TFIDF()
     self.query = queryString
Example #38
    def createTrainingSet(self):

        # initialize one class SVM models for each intent
        from sklearn import svm
        windowModel = svm.OneClassSVM(nu=0.01, kernel="linear")
        filterModel = svm.OneClassSVM(nu=0.01, kernel="linear")
        aggregateModel = svm.OneClassSVM(nu=0.01, kernel="linear")
        groupModel = svm.OneClassSVM(nu=0.01, kernel="linear")

        from tfidf import TFIDF
        tfidfInstance = TFIDF()

        documents = []
        fdoc = []
        adoc = []
        wdoc = []
        gdoc = []

        import json
        with open('intents.json') as json_data:
            intentsData = json.load(json_data)
        for intent in intentsData['intents']:
            for pattern in intent['pattern']:
                documents.append(pattern)
                if intent['tag'] == "filter":
                    fdoc.append(pattern)
                if intent['tag'] == "window":
                    wdoc.append(pattern)
                if intent['tag'] == "aggre":
                    adoc.append(pattern)
                if intent['tag'] == "group":
                    gdoc.append(pattern)

        texts = []
        # words relevant to the stream. These words do not help in intent detection and must be removed
        from FeatureExtractionWithTFIDF import TFIDFPreparer
        tfidfPreparer = TFIDFPreparer()
        for doc in documents:
            text = tfidfPreparer.prepareTextForTFIDF(doc)
            texts.append(text)

        self.countVectorizer, self.idf = tfidfInstance.getIDF(documents)

        self.tfidf_filter = tfidfInstance.getTFIDF(fdoc, self.countVectorizer,
                                                   self.idf)
        self.tfidf_aggre = tfidfInstance.getTFIDF(adoc, self.countVectorizer,
                                                  self.idf)
        self.tfidf_window = tfidfInstance.getTFIDF(wdoc, self.countVectorizer,
                                                   self.idf)
        self.tfidf_group = tfidfInstance.getTFIDF(gdoc, self.countVectorizer,
                                                  self.idf)

        x_filter = []
        for i in range(len(fdoc)):
            total = tfidfPreparer.getSumOfCosineSimilarity(
                self.tfidf_filter[i], self.tfidf_filter)
            x_filter.append([total])
        x_aggre = []
        for i in range(len(adoc)):
            total = tfidfPreparer.getSumOfCosineSimilarity(
                self.tfidf_aggre[i], self.tfidf_aggre)
            x_aggre.append([total])
        x_window = []
        for i in range(len(wdoc)):
            total = tfidfPreparer.getSumOfCosineSimilarity(
                self.tfidf_window[i], self.tfidf_window)
            x_window.append([total])
        x_group = []
        for i in range(len(gdoc)):
            total = tfidfPreparer.getSumOfCosineSimilarity(
                self.tfidf_group[i], self.tfidf_group)
            x_group.append([total])

        filterModel.fit(x_filter)
        windowModel.fit(x_window)
        aggregateModel.fit(x_aggre)
        groupModel.fit(x_group)

        import pickle
        filename = 'finalized_windowModel.sav'
        pickle.dump(windowModel, open(filename, 'wb'))
        filename = 'finalized_filterModel.sav'
        pickle.dump(filterModel, open(filename, 'wb'))
        filename = 'finalized_aggregateModel.sav'
        pickle.dump(aggregateModel, open(filename, 'wb'))
        filename = 'finalized_groupModel.sav'
        pickle.dump(groupModel, open(filename, 'wb'))
Example #39
if __name__ == '__main__':
    # Get command-line args
    args_ = get_setup_args()

    # Download resources
    download(args_)

    # Import spacy language model
    nlp = spacy.blank("en")

    # Keep all the docs for TF-IDF initialization
    tfidf_docs = []

    # Preprocess dataset
    args_.train_file = url_to_data_path(args_.train_url)
    args_.dev_file = url_to_data_path(args_.dev_url)
    if args_.include_test_examples:
        args_.test_file = url_to_data_path(args_.test_url)
    glove_dir = url_to_data_path(args_.glove_url.replace('.zip', ''))
    glove_ext = '.txt' if glove_dir.endswith('d') else '.{}d.txt'.format(
        args_.glove_dim)
    args_.glove_file = os.path.join(glove_dir,
                                    os.path.basename(glove_dir) + glove_ext)
    pre_process(args_)

    from tfidf import TFIDF
    print(len(tfidf_docs))
    tfidf_scorer = TFIDF(tfidf_docs)
    tfidf_scorer.prepare_data()
    tfidf_scorer.save_to_pickle()