def get_data(topic_tag):
    """Get training data from MongoDB. Returns parsed post content and
    binary labels ('1' = on-topic, '0' = irrelevant) as numpy arrays."""
    content, labels = [], []
    params = ['clean', 'links', 'images', 'embed', 'ident', 'punct',
              'misc', 'stop', 'encoding']
    # Positive examples: every post tagged with the requested topic
    mongo_data = training_data.find({'Topic': topic_tag})
    for entry in mongo_data:
        content.append(pc.parse_post(entry['Content'], params))
        labels.append('1')
    num_posts = len(content)
    # Negative examples: cap irrelevant posts at twice the positive count
    mongo_data = training_data.find({'Topic': 'Irrelevant'})
    for entry in mongo_data:
        if len(content) > num_posts * 2:
            break
        content.append(pc.parse_post(entry['Content'], params))
        labels.append('0')
    content = np.asarray(content)
    labels = np.asarray(labels)
    return content, labels
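# The arrays returned by get_data() plug straight into a scikit-learn text
# pipeline; predict_proba() in code_posts() below implies the topic models are
# probabilistic classifiers over raw text. A minimal training sketch under that
# assumption (the vectorizer, the classifier, the 'Python' topic tag, and the
# train_topic_model helper are all illustrative, not the project's actual choices):
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

def train_topic_model(topic_tag):
    content, labels = get_data(topic_tag)
    model = make_pipeline(TfidfVectorizer(), LogisticRegression())
    model.fit(content, labels)
    # classes_ sort to ['0', '1'], so predict_proba(...)[:, 1] is the
    # probability of being on-topic, i.e. the code[1] used in code_posts()
    return model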
def code_posts(es_docs):
    """Score each Elasticsearch post against every topic model and bulk
    re-index the documents with the new relevance codes."""
    contents = []
    es_docs = es_docs['hits']['hits']
    for es_post in es_docs:
        contents.append(pc.parse_post(es_post['_source']['post']['content'], params))
    contents = np.asarray(contents)
    # Reset the analysis field and drop the query score before re-indexing
    for es_post in es_docs:
        es_post['_source']['analysis'] = {'relevance': []}
        es_post.pop('_score', None)
    # TODO: Only update the analysis field
    for topic in topic_models:
        predict = topic_models[topic].predict_proba(contents)
        for es_post, code in zip(es_docs, predict):
            es_post['_source']['analysis']['relevance'].append(
                {'topicid': topic, 'code': code[1]})
    try:
        helpers.bulk(es, es_docs)
    except Exception as e:
        print('Exception: logging')
        with open('temp.txt', 'a') as outfile:
            outfile.write('>Event Start:' + '\n' + str(e) + '\n')
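# Sketch for the TODO above: push partial updates so only the analysis field
# changes rather than re-indexing whole documents. This relies on the
# elasticsearch-py bulk helper's 'update' op_type; update_analysis_only is a
# hypothetical helper, not part of the original code:
def update_analysis_only(es_docs):
    actions = []
    for es_post in es_docs:
        actions.append({
            '_op_type': 'update',
            '_index': es_post['_index'],
            '_type': es_post['_type'],
            '_id': es_post['_id'],
            'doc': {'analysis': es_post['_source']['analysis']},
        })
    helpers.bulk(es, actions)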
def get_data(topic_name):
    """Get the test data from MongoDB. Returns post ids, parsed content,
    and labels."""
    ids, content, labels = [], [], []
    params = ['clean', 'links', 'images', 'embed', 'ident', 'punct',
              'misc', 'stop', 'encoding']
    mongo_data = test_data.find({'Topic': topic_name})
    for entry in mongo_data:
        ids.append(entry['PostId'])
        content.append(pc.parse_post(entry['Content'], params))
        labels.append(topic_name)
    content = np.asarray(content)
    labels = np.asarray(labels)
    return ids, content, labels
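# The scroll loop below is a fragment; it assumes roughly this surrounding
# setup. Every concrete value here (timeout, page size, host, index, query)
# is an assumption for illustration:
from elasticsearch import Elasticsearch

ES_SCROLL_TIMEOUT = '2m'       # assumed: how long ES keeps each scroll context alive
ES_DOC_COUNT = 500             # assumed: documents fetched per scroll page
index_name = 'posts'           # assumed index name
es_query = {'match_all': {}}   # assumed query body
es = Elasticsearch(['localhost:9200'])
queries_complete, first_query = False, True
# data_dict is populated elsewhere: one entry per ES _id, each already
# holding a 'user_code'; the loop below fills in the parsed 'content'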
while not queries_complete:
    if first_query:
        es_response = es.search(index=index_name, body={'query': es_query},
                                scroll=ES_SCROLL_TIMEOUT, size=ES_DOC_COUNT)
        es_scroll_id = es_response['_scroll_id']
        total_docs = es_response['hits']['total']
        first_query = False
    else:
        es_response = es.scroll(scroll_id=es_scroll_id, scroll=ES_SCROLL_TIMEOUT)
        es_scroll_id = es_response['_scroll_id']
    if len(es_response['hits']['hits']) == 0:
        queries_complete = True
    else:
        for entry in es_response['hits']['hits']:
            data_dict[entry['_id']]['content'] = pc.parse_post(
                entry['_source']['twitter']['text'], params)

# Flatten the dict into parallel arrays for prediction
test_ids, test_user, test_content = [], [], []
for key in data_dict.keys():
    test_ids.append(key)
    test_user.append(data_dict[key]['user_code'])
    test_content.append(data_dict[key]['content'])
test_content = np.asarray(test_content)
test_ids = np.asarray(test_ids)

predicted = model.predict_proba(test_content)

output_content = []
for postid, tweet_text, label, code in zip(test_ids, test_content, predicted, test_user):