def __init__(self):
    """Open the database connection and preload the expert member rows.

    Stores on the instance:
      * ``query``      - parameterised SQL (one ``%s``: a member id) that
                         selects list id, name and description for the lists
                         joined to that member through ``list_member_rec``.
      * ``connection`` / ``cursor`` - the open connection and a cursor on it.
      * ``parser``     - a fresh ``StringParser``.
      * ``members``    - every row of ``expert_10_50``.
      * ``N_members``  - ``count(*)`` of ``expert_10_50`` (also printed).
      * ``ind``        - initialised to 0.
    """
    select_members = """ select * from expert_10_50 """
    select_member_total = """ select count(*) from expert_10_50 """
    # Adjacent literals concatenate to exactly the original query text.
    self.query = (
        " select l.list_id,l.list_name,l.list_description"
        " from list_rec as l"
        " join list_member_rec as lm1 on lm1.list_id=l.list_id"
        " where lm1.member_id = %s; "
    )

    dsn = "dbname='nlstudent' user = '******' password ='******'"
    self.connection = connect(dsn)
    self.ind = 0
    self.parser = StringParser()

    # One cursor is shared by the two start-up queries below.
    db_cursor = self.connection.cursor()
    self.cursor = db_cursor

    db_cursor.execute(select_members)
    self.members = db_cursor.fetchall()

    db_cursor.execute(select_member_total)
    total_row = db_cursor.fetchone()
    self.N_members = total_row[0]
    print(self.N_members)
import string
from nltk import FreqDist
import nltk
from psycopg2._psycopg import DatabaseError
import sys
from parsers.string_parser import StringParser, latin_letters
import psycopg2

__author__ = 'Katharine'

# Connection string handed to psycopg2.connect() below.
conn_string = "dbname='nlstudent' user = '******' password ='******'"

# NOTE: the connection and cursor are opened at import time and shared
# module-wide.
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()
f = StringParser()
s = {}

# Name and description of every list joined to one member through
# list_member_rec. One %s placeholder: the member id.
get_listinfo_for_member = """SELECT l.list_name,l.list_description FROM list_rec as l JOIN list_member_rec as lm ON l.list_id = lm.list_id where lm.member_id = %s"""

# The `count` column of member_list_count_rec for one member (%s: member id).
# NOTE(review): this joins member_list_count_rec to list_rec on list_id --
# confirm that member_list_count_rec really has a list_id column.
get_listcount_for_member = """SELECT count FROM member_list_count_rec as lm JOIN list_rec as l ON l.list_id = lm.list_id where lm.member_id = %s;"""

# Name and description of a single list. One %s placeholder: the list id.
get_listinfo_for_list = """SELECT l.list_name,l.list_description FROM list_rec as l where list_id = %s"""

# Id, name and description of every row in list_rec (no parameters).
get_all_listinfo_for_all_lists = """SELECT l.list_id,l.list_name,l.list_description FROM list_rec as l"""
import string
from nltk import FreqDist
import nltk
from psycopg2._psycopg import DatabaseError
import sys
from parsers.string_parser import StringParser, latin_letters
import psycopg2

__author__ = 'Katharine'

# Connection string handed to psycopg2.connect() below.
conn_string = "dbname='nlstudent' user = '******' password ='******'"

# NOTE: the connection and cursor are opened at import time and shared
# module-wide.
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()
f = StringParser()
s = {}

# Name and description of every list joined to one member through
# list_member_rec. One %s placeholder: the member id.
get_listinfo_for_member = """SELECT l.list_name,l.list_description FROM list_rec as l JOIN list_member_rec as lm ON l.list_id = lm.list_id where lm.member_id = %s"""

# Earlier per-member variant that counted rows on the fly (kept disabled):
# get_listcount_for_member = """SELECT count(lm.list_id) FROM list_member_rec as lm JOIN
#                               list_rec as l ON l.list_id = lm.list_id
#                               where lm.member_id = %s;"""
"""NB relies on table member_list_count_rec containing listcount for members """
# Reads (member_id, count) straight from member_list_count_rec.
# NOTE(review): `limit 1 offset 101` returns at most one row -- presumably a
# debugging restriction; confirm before relying on this for all members.
get_listcount_for_members = """SELECT member_id,count FROM member_list_count_rec limit 1 offset 101;"""

# Name and description of a single list. One %s placeholder: the list id.
get_listinfo_for_list = """SELECT l.list_name,l.list_description FROM list_rec as l where list_id = %s"""
import nltk from parsers.string_parser import StringParser, latin_letters import psycopg2 __author__ = 'Katharine' # conn_string = "dbname='nlstudent' user = '******' password ='******'" # # conn = psycopg2.connect(conn_string) # cursor = conn.cursor() # read list names and descriptions from a file # reader = codecs.open('no_wf_rec.csv', encoding='utf-8') # reader_csv = csv.reader('no_wf_rec.csv','rb') csv_file = 'no_wf_rec.csv' f = StringParser() s = {} tstrout = '' # get_listinfo_for_member = """SELECT l.list_name,l.list_description FROM list_rec JOIN # # list_member_rec ON list_rec.list_id = list_member_rec.list_id # # where member_id = %s """ # cursor.execute(get_listinfo_for_member,21447363) with open(csv_file) as csvfile: dialect = csv.Sniffer().sniff(csvfile.read(1024)) csvfile.seek(0) reader = csv.reader(csvfile, dialect) # rows = cursor.fetchall() # for row in rows:
from django.core.validators import URLValidator from django.db.models import Q import requests __author__ = 'James' from bs4 import BeautifulSoup import feedparser from content_acquisition.models import FeedRec from articles.models import ArticleRec from content_acquisition.ArticleWrapper import ArticleWrapper from newspaper import Article, ArticleException from parsers.string_parser import StringParser clf = pickle.load(open('./pipe.p', 'rb')) parser = StringParser() from django import setup setup() val = URLValidator() def aggregate(): ArticleRec.objects.filter( article_published__lte=datetime.datetime.today() - datetime.timedelta(days=7)).delete() for f in shuffle(FeedRec.objects.all()): u = f.feed_url
class _ExpertCorpus(object):
    """Streaming corpus: one bag-of-words document per qualifying expert.

    Construction connects to the database and loads every row of
    ``expert_10_50``.  Iterating runs one query per member, counts the parsed
    list text, and yields ``[(token_id, count), ...]`` for each expert that
    passes the frequency threshold.  The module-level ``expert2doc`` mapping
    is filled with ``member_id -> document index`` as documents are yielded.
    """

    def __init__(self):
        """Open the connection, create the parser and preload the members."""
        select_members = """ select * from expert_10_50 """
        select_member_total = """ select count(*) from expert_10_50 """
        # Adjacent literals concatenate to exactly the original query text.
        self.query = (
            " select l.list_id,l.list_name,l.list_description"
            " from list_rec as l"
            " join list_member_rec as lm1 on lm1.list_id=l.list_id"
            " where lm1.member_id = %s; "
        )

        dsn = "dbname='nlstudent' user = '******' password ='******'"
        self.connection = connect(dsn)
        self.ind = 0
        self.parser = StringParser()

        # One cursor serves the start-up queries and the per-member queries.
        db_cursor = self.connection.cursor()
        self.cursor = db_cursor

        db_cursor.execute(select_members)
        self.members = db_cursor.fetchall()

        db_cursor.execute(select_member_total)
        total_row = db_cursor.fetchone()
        self.N_members = total_row[0]
        print(self.N_members)
        # Debug override: self.members =[12, 50393960, 39247971, 39224224]

    def __iter__(self):
        """Yield a ``[(token_id, count), ...]`` list per qualifying expert.

        For each member row the list titles and descriptions are parsed and
        their ``'text'`` tokens counted.  Only terms seen more than once are
        kept, ordered by descending count.  An expert is skipped unless its
        most frequent term occurs more than 10 times.  Terms missing from the
        dictionary loaded from ``terms.dict`` are dropped from the bag.
        """
        list_dict = Dictionary.load('terms.dict')
        # list_dict.filter_extremes(no_below=1000,no_above=0.99)
        processed = 0
        next_doc_id = 0
        for member_id, _list_count in self.members:
            # Progress report every 100 members.
            if processed % 100 == 0:
                print('Done', processed)

            self.cursor.execute(self.query, (member_id,))
            expert_text = Counter()
            for row in self.cursor:
                parsed = self.parser.parse_list(title=row[1],
                                                description=row[2])
                expert_text.update(parsed['text'])

            terms = [(term, freq) for term, freq in expert_text.items()
                     if freq > 1]
            terms.sort(key=operator.itemgetter(1), reverse=True)
            processed += 1

            # Skip experts with no repeated term or a weak top term.
            if not terms or not terms[0][1] > 10:
                continue

            word_bag = []
            for term, freq in terms:
                try:
                    word_bag.append((list_dict.token2id[term], freq))
                except KeyError:
                    # Term is not in the dictionary: leave it out.
                    pass

            expert2doc[member_id] = next_doc_id
            next_doc_id += 1
            yield word_bag
class ExpertCorpus(object):
    """Streaming corpus: one topic-filtered bag-of-words per expert.

    Construction connects to the ``list_6220`` database and loads every row
    of ``expert_420``.  Iterating runs one query per member, counts the
    parsed list text, keeps the frequent terms that start with one of the
    module-level ``topics`` prefixes, and yields ``[(token_id, count), ...]``.
    The module-level ``expert2doc`` mapping is filled with
    ``member_id -> document index`` as documents are yielded.
    """

    def __init__(self):
        """Open the connection, create the parser and preload the members."""
        member_query = """ select * from expert_420 """
        count_query = """ select count(*) from expert_420 """
        # One %s placeholder: the member id.
        self.query = """ select l.list_id,l.list_name,l.list_description from list_rec_420 as l join list_member_rec_420 as lm1 on lm1.list_id=l.list_id where lm1.member_id = %s; """
        conn_string = "dbname='list_6220' user = '******' password =''"
        self.connection = connect(conn_string)
        self.ind = 0
        self.parser = StringParser()
        self.cursor = self.connection.cursor()
        self.cursor.execute(member_query)
        self.members = self.cursor.fetchall()
        self.cursor.execute(count_query)
        self.N_members = self.cursor.fetchone()[0]
        print(self.N_members)
        # Debug override: self.members =[50393960, 39247971, 39224224]

    @staticmethod
    def _topic_terms(term_counts, topic_prefixes, min_count=10):
        """Return ``(term, count)`` pairs that are frequent and on-topic.

        A pair is kept when ``count > min_count`` and the term starts with at
        least one of ``topic_prefixes``.  The result is a list, so it can be
        printed and iterated any number of times.
        """
        return [
            (term, freq)
            for term, freq in term_counts.items()
            if freq > min_count
            and any(term.startswith(prefix) for prefix in topic_prefixes)
        ]

    @staticmethod
    def _to_word_bag(terms, token2id):
        """Map ``(term, count)`` pairs to ``(token_id, count)`` pairs.

        Terms that are not keys of ``token2id`` are dropped.
        """
        return [(token2id[term], freq)
                for term, freq in terms if term in token2id]

    def __iter__(self):
        """Yield a ``[(token_id, count), ...]`` list for every member row.

        Token ids come from the dictionary loaded from ``terms.dict``.
        """
        list_dict = Dictionary.load('terms.dict')
        # list_dict.filter_extremes(no_below=1000,no_above=0.99)
        counter = 0
        doc_id = 0
        for member_id, count in self.members:
            if counter % 1000 == 0:
                print('Done', counter)
            print(member_id, count)
            self.cursor.execute(self.query, (member_id,))
            expert_text = Counter()
            for result in self.cursor:
                parsed_text = self.parser.parse_list(title=result[1],
                                                     description=result[2])
                expert_text.update(parsed_text['text'])
                # expert_text.update(parsed_text['bigrams'])
            # Build the terms as a list, not a generator: the original
            # generator was exhausted by the debug print below, which left
            # every yielded word_bag empty.
            terms = self._topic_terms(expert_text, topics)
            counter += 1
            print(terms)
            word_bag = self._to_word_bag(terms, list_dict.token2id)
            expert2doc[member_id] = doc_id
            doc_id += 1
            yield word_bag
import csv import string from nltk import FreqDist import nltk from parsers.string_parser import StringParser, latin_letters import psycopg2 __author__ = 'Katharine' conn_string = "dbname='nlstudent' user = '******' password ='******'" conn = psycopg2.connect(conn_string) cursor = conn.cursor() f = StringParser() s = {} get_listinfo_for_list = """SELECT l.list_name,l.list_description FROM list_rec as l where list_id = %s""" def get_list_dists_for(list_id): cursor.execute(get_listinfo_for_list, [list_id]) rows = cursor.fetchall() tstrout = '' for row in rows: c_line = str(row) c_line = ''.join(filter(lambda x: x in string.printable, c_line)) # print(c_line) if len(c_line): s = f.parse(c_line, True)
class _ExpertCorpus(object):
    """Streaming corpus: one bag-of-words document per qualifying expert.

    Construction connects to the database and loads every row of
    ``expert_10_50``.  Iterating runs one query per member, counts the parsed
    list text, and yields ``[(token_id, count), ...]`` for each expert that
    passes the frequency threshold.  The module-level ``expert2doc`` mapping
    is filled with ``member_id -> document index`` as documents are yielded.
    """

    def __init__(self):
        """Open the connection, create the parser and preload the members."""
        select_members = """ select * from expert_10_50 """
        select_member_total = """ select count(*) from expert_10_50 """
        # Adjacent literals concatenate to exactly the original query text.
        self.query = (
            " select l.list_id,l.list_name,l.list_description"
            " from list_rec as l"
            " join list_member_rec as lm1 on lm1.list_id=l.list_id"
            " where lm1.member_id = %s; "
        )

        dsn = "dbname='nlstudent' user = '******' password ='******'"
        self.connection = connect(dsn)
        self.ind = 0
        self.parser = StringParser()

        # One cursor serves the start-up queries and the per-member queries.
        db_cursor = self.connection.cursor()
        self.cursor = db_cursor

        db_cursor.execute(select_members)
        self.members = db_cursor.fetchall()

        db_cursor.execute(select_member_total)
        total_row = db_cursor.fetchone()
        self.N_members = total_row[0]
        print(self.N_members)
        # Debug override: self.members =[12, 50393960, 39247971, 39224224]

    def __iter__(self):
        """Yield a ``[(token_id, count), ...]`` list per qualifying expert.

        For each member row the list titles and descriptions are parsed and
        their ``'text'`` tokens counted.  Only terms seen more than once are
        kept, ordered by descending count.  An expert is skipped unless its
        most frequent term occurs more than 10 times.  Terms missing from the
        dictionary loaded from ``terms.dict`` are dropped from the bag.
        """
        list_dict = Dictionary.load('terms.dict')
        # list_dict.filter_extremes(no_below=1000,no_above=0.99)
        processed = 0
        next_doc_id = 0
        for member_id, _list_count in self.members:
            # Progress report every 100 members.
            if processed % 100 == 0:
                print('Done', processed)

            self.cursor.execute(self.query, (member_id,))
            expert_text = Counter()
            for row in self.cursor:
                parsed = self.parser.parse_list(title=row[1],
                                                description=row[2])
                expert_text.update(parsed['text'])

            terms = [(term, freq) for term, freq in expert_text.items()
                     if freq > 1]
            terms.sort(key=operator.itemgetter(1), reverse=True)
            processed += 1

            # Skip experts with no repeated term or a weak top term.
            if not terms or not terms[0][1] > 10:
                continue

            word_bag = []
            for term, freq in terms:
                try:
                    word_bag.append((list_dict.token2id[term], freq))
                except KeyError:
                    # Term is not in the dictionary: leave it out.
                    pass

            expert2doc[member_id] = next_doc_id
            next_doc_id += 1
            yield word_bag
class ExpertCorpus(object):
    """Streaming corpus: one topic-filtered bag-of-words per expert.

    Construction connects to the ``list_6220`` database and loads every row
    of ``expert_420``.  Iterating runs one query per member, counts the
    parsed list text, keeps the frequent terms that start with one of the
    module-level ``topics`` prefixes, and yields ``[(token_id, count), ...]``.
    The module-level ``expert2doc`` mapping is filled with
    ``member_id -> document index`` as documents are yielded.
    """

    def __init__(self):
        """Open the connection, create the parser and preload the members."""
        member_query = """ select * from expert_420 """
        count_query = """ select count(*) from expert_420 """
        # One %s placeholder: the member id.
        self.query = """ select l.list_id,l.list_name,l.list_description from list_rec_420 as l join list_member_rec_420 as lm1 on lm1.list_id=l.list_id where lm1.member_id = %s; """
        conn_string = "dbname='list_6220' user = '******' password =''"
        self.connection = connect(conn_string)
        self.ind = 0
        self.parser = StringParser()
        self.cursor = self.connection.cursor()
        self.cursor.execute(member_query)
        self.members = self.cursor.fetchall()
        self.cursor.execute(count_query)
        self.N_members = self.cursor.fetchone()[0]
        print(self.N_members)
        # Debug override: self.members =[50393960, 39247971, 39224224]

    @staticmethod
    def _topic_terms(term_counts, topic_prefixes, min_count=10):
        """Return ``(term, count)`` pairs that are frequent and on-topic.

        A pair is kept when ``count > min_count`` and the term starts with at
        least one of ``topic_prefixes``.  The result is a list, so it can be
        printed and iterated any number of times.
        """
        return [
            (term, freq)
            for term, freq in term_counts.items()
            if freq > min_count
            and any(term.startswith(prefix) for prefix in topic_prefixes)
        ]

    @staticmethod
    def _to_word_bag(terms, token2id):
        """Map ``(term, count)`` pairs to ``(token_id, count)`` pairs.

        Terms that are not keys of ``token2id`` are dropped.
        """
        return [(token2id[term], freq)
                for term, freq in terms if term in token2id]

    def __iter__(self):
        """Yield a ``[(token_id, count), ...]`` list for every member row.

        Token ids come from the dictionary loaded from ``terms.dict``.
        """
        list_dict = Dictionary.load('terms.dict')
        # list_dict.filter_extremes(no_below=1000,no_above=0.99)
        counter = 0
        doc_id = 0
        for member_id, count in self.members:
            if counter % 1000 == 0:
                print('Done', counter)
            print(member_id, count)
            self.cursor.execute(self.query, (member_id,))
            expert_text = Counter()
            for result in self.cursor:
                parsed_text = self.parser.parse_list(title=result[1],
                                                     description=result[2])
                expert_text.update(parsed_text['text'])
                # expert_text.update(parsed_text['bigrams'])
            # Build the terms as a list, not a generator: the original
            # generator was exhausted by the debug print below, which left
            # every yielded word_bag empty.
            terms = self._topic_terms(expert_text, topics)
            counter += 1
            print(terms)
            word_bag = self._to_word_bag(terms, list_dict.token2id)
            expert2doc[member_id] = doc_id
            doc_id += 1
            yield word_bag
import string from nltk import FreqDist import nltk from psycopg2._psycopg import DatabaseError import sys from parsers.string_parser import StringParser, latin_letters import psycopg2 __author__ = 'Katharine' conn_string = "dbname='nlstudent' user = '******' password ='******'" conn = psycopg2.connect(conn_string) cursor = conn.cursor() f = StringParser() s = {} get_listinfo_for_member = """SELECT l.list_name,l.list_description FROM list_rec as l JOIN list_member_rec as lm ON l.list_id = lm.list_id where lm.member_id = %s""" # get_listcount_for_member = """SELECT count(lm.list_id) FROM list_member_rec as lm JOIN # list_rec as l ON l.list_id = lm.list_id # where lm.member_id = %s;""" """NB relies on table member_list_count_rec containing listcount for members """ get_listcount_for_members = """SELECT member_id,count FROM member_list_count_rec limit 1 offset 101;""" get_listinfo_for_list = """SELECT l.list_name,l.list_description FROM list_rec as l