def collectInfo():
    # `words` is the module-level stop-word list (the same contents as the
    # inline list in the second collectInfo variant below).
    morph = pymorphy2.MorphAnalyzer()
    titles = session.query(distinct(HseArticle.title)).all()
    print "len of distinct(titles) : " + str(len(titles))
    titles_list = [x[0] for x in titles]

    # Parse the crawler log: an "Author: <name> <uri>" line is followed by an
    # indented list of publication URLs; map each publication URI to the URIs
    # of its authors.
    titles_dic = {}
    isAuthor = re.compile('^Author:\s*', re.IGNORECASE)
    isPubList = False
    isPub = re.compile('^\thttp://publications.hse.ru/view/.*', re.IGNORECASE)
    author_name = ""
    authorUri = ""
    with open("logfile2.txt", "r") as logfile:
        fileArr = logfile.readlines()
    for line in fileArr:
        if isAuthor.match(line):
            lineArr = line.split()
            authorUri = lineArr[-1]
            author_name = ' '.join(lineArr[1:4])
            author_name = UnicodeDammit(author_name).unicode_markup
            isPubList = author_name != ""
        elif isPubList and isPub.match(line):
            print line.strip() + ' ' + author_name
            pub = line.strip()
            if pub not in titles_dic:
                titles_dic[pub] = []
            if authorUri not in titles_dic[pub]:
                titles_dic[pub].append(authorUri)

    stop_string = ":.-()!,[]'\"|"

    def normalize(tokens):
        # Strip punctuation, lowercase, drop stop words, then lemmatise.
        result = []
        for x in tokens:
            x = x.strip(stop_string).lower()
            if x and x not in words:
                result.append(morph.parse(UnicodeDammit(x).unicode_markup)[0].normal_form)
        return result

    result_list = []
    authors = []
    for uri in titles_dic:
        article = session.query(HseArticle)\
            .filter(HseArticle.uri == uri)\
            .first()
        if article is None:
            continue
        collected_info = []
        collected_info.extend(normalize(article.abstr.split()))
        collected_info.extend(normalize(y for x in article.keyword.split(";")
                                        for y in x.split(" ")))
        collected_info.extend(normalize(article.title.split()))
        collected_info.extend(normalize(article.elib.split()))
        collected_info.extend(normalize(article.interest.split()))
        # Author tokens are kept as-is (cleaned but not lemmatised).
        author_list = [x.strip(stop_string).lower()
                       for x in article.authors.split() if x not in words]
        collected_info.extend(author_list)
        authors.extend(author_list)
        result_list.append(collected_info)
    return result_list, authors
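# A quick, self-contained sketch of the pymorphy2 lemmatisation used in
# collectInfo above (illustration only, separate from the pipeline):
# parse() returns a list of candidate analyses, and normal_form on the
# top-ranked one gives the lowercased dictionary form.
import pymorphy2

morph = pymorphy2.MorphAnalyzer()
for token in [u'статьи', u'годы', u'публикации']:
    print token, '->', morph.parse(token)[0].normal_form
# expected lemmas: статья, год, публикация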
def collectInfo():
    # Stop words dropped from every text field: digits, single Latin and
    # Cyrillic letters, and common Russian function words (a set for fast
    # membership tests).
    words = set([
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "0",
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
        "а", "б", "в", "г", "д", "е", "ё", "ж", "з", "и", "л", "м", "н",
        "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ш", "щ", "ъ", "ь",
        "э", "ю", "я",
        "большой", "бы", "быть", "в", "весь", "вот", "все", "всей", "вы",
        "говорить", "год", "да", "для", "до", "еще", "же", "знать", "и",
        "из", "к", "как", "который", "мочь", "мы", "на", "наш", "не",
        "него", "нее", "нет", "них", "но", "о", "один", "она", "они",
        "оно", "оный", "от", "ото", "по", "с", "свой", "себя", "сказать",
        "та", "такой", "только", "тот", "ты", "у", "что", "это", "этот",
        "я", "без", "более", "больше", "будет", "будто", "был", "была",
        "были", "было", "вам", "вас", "ведь", "вдоль", "вдруг", "вместо",
        "вне", "вниз", "внизу", "внутри", "во", "вокруг", "впрочем",
        "всегда", "всего", "всех", "всю", "где", "давай", "давать",
        "даже", "достаточно", "другой", "его", "ему", "ее", "её", "ей",
        "если", "есть", "ещё", "за", "за исключением", "здесь", "из-за",
        "или", "им", "иметь", "иногда", "их", "как-то", "кто", "когда",
        "кроме", "куда", "ли", "либо", "между", "меня", "мне", "много",
        "может", "мое", "моё", "мои", "мой", "навсегда", "над", "надо",
        "наконец", "нас", "неё", "ней", "ни", "нибудь", "никогда", "ним",
        "ничего", "ну", "об", "однако", "он", "опять", "отчего", "очень",
        "перед", "под", "после", "потом", "потому", "потому что", "почти",
        "при", "про", "раз", "разве", "свою", "снова", "со", "совсем",
        "так", "также", "такие", "там", "те", "тебя", "тем", "теперь",
        "то", "тогда", "того", "тоже", "той", "том", "тут", "уже", "хоть",
        "хотя", "чего", "чего-то", "чей", "чем", "через", "что-то",
        "чтоб", "чтобы", "чуть", "чьё", "чья", "эта", "эти", "эту",
        "этого", "этом",
    ])
    morph = pymorphy2.MorphAnalyzer()
    titles = session.query(distinct(HseArticle.title)).all()
    print "len of distinct(titles) : " + str(len(titles))
    titles_list = [x[0] for x in titles]
    results = session.query(HseArticle.id, HseArticle.interest,
                            HseArticle.elib, HseArticle.keyword,
                            HseArticle.title, HseArticle.abstr)\
        .filter(HseArticle.title.in_(titles_list))\
        .all()
    print "len of distinct articles(by title) : " + str(len(results))

    # Parse the crawler log: an "Author: ..." line is followed by an indented
    # list of publication URLs; here the author key is the first name token
    # longer than three characters, lowercased.
    titles_dic = {}
    isAuthor = re.compile('^Author:\s*', re.IGNORECASE)
    isPubList = False
    isPub = re.compile('^\thttp://publications.hse.ru/view/.*', re.IGNORECASE)
    author_name = ""
    with open("logfile.txt", "r") as logfile:
        fileArr = logfile.readlines()
    for line in fileArr:
        if isAuthor.match(line):
            lineArr = line.split(":")[-1].split()
            for l in lineArr:
                if len(l) > 3:
                    print l.lower()
                    author_name = l.lower()
                    break
            isPubList = author_name != ""
        elif isPubList and isPub.match(line):
            print line.strip() + ' ' + author_name
            pub = line.strip()
            marked = UnicodeDammit(author_name).unicode_markup
            if pub not in titles_dic:
                titles_dic[pub] = []
            if marked not in titles_dic[pub]:
                titles_dic[pub].append(marked)

    stop_string = ":.-()!,[]'\"|"

    def normalize(tokens):
        # Strip punctuation, lowercase, drop stop words, then lemmatise.
        result = []
        for x in tokens:
            x = x.strip(stop_string).lower()
            if x and x not in words:
                result.append(morph.parse(UnicodeDammit(x).unicode_markup)[0].normal_form)
        return result

    result_list = []
    authors = []
    for uri in titles_dic:
        article = session.query(HseArticle)\
            .filter(HseArticle.uri == uri)\
            .first()
        if article is None:
            continue
        collected_info = []
        collected_info.extend(normalize(article.abstr.split()))
        collected_info.extend(normalize(y for x in article.keyword.split(";")
                                        for y in x.split(" ")))
        collected_info.extend(normalize(article.title.split()))
        collected_info.extend(normalize(article.elib.split()))
        collected_info.extend(normalize(article.interest.split()))
        # Here the author tokens come from the log-derived mapping, not the DB.
        author_list = list(titles_dic[uri])
        collected_info.extend(author_list)
        authors.extend(author_list)
        result_list.append(collected_info)
    return result_list, authors
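# Hedged usage sketch for the collectInfo variants above, assuming the
# SQLAlchemy session is configured and the crawler logfile is in place.
# Each element of result_list is the flat bag of normalised tokens for one
# publication; authors concatenates the author tokens across publications.
if __name__ == '__main__':
    result_list, authors = collectInfo()
    print "publications collected : " + str(len(result_list))
    print "distinct author tokens : " + str(len(set(authors)))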
def collectInfo2():
    # `words` is the module-level stop-word list, as in collectInfo above.
    morph = pymorphy2.MorphAnalyzer()

    # The logfile-parsing pass used in collectInfo is replaced here by a
    # shelve file whose "authorUri2paper" entry already maps each author URI
    # to the paper URIs it appears on; invert it into paper -> authors.
    d = shelve.open("authors.list")
    titles_dic = {}
    for authUri in d["authorUri2paper"].keys():
        for paperUri in d["authorUri2paper"][authUri]:
            if paperUri not in titles_dic:
                titles_dic[paperUri] = [authUri]
            elif authUri not in titles_dic[paperUri]:
                titles_dic[paperUri].append(authUri)
    d.close()

    result_list = []
    db = MySQLdb.connect(host="localhost", user="******", passwd="pass",
                         db="nlp", charset='utf8')
    cursor = db.cursor()
    author_list = set([])
    for uri in titles_dic:
        collected_info = []
        # Parameterised query: avoids the SQL-injection risk of formatting
        # the URI directly into the statement.
        sql = """SELECT abstr, keyword, title, elib, interest, authors
                 FROM hse_article WHERE uri = %s"""
        cursor.execute(sql, (uri,))
        for row in cursor.fetchall():
            for a in row[5].split():
                author_list.add(a)
            for field in row:
                for i in re.split(r'[;,()"\[\] ]+', field):
                    if i == u"":
                        continue
                    i = morph.parse(i)[0].normal_form.lower()
                    if i not in words:
                        collected_info.append(i)
        result_list.append(collected_info)
    db.close()
    print author_list
    print len(author_list)
    print result_list
    return result_list
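# Minimal sketch of the shelve layout collectInfo2 expects: one
# "authorUri2paper" key holding a dict that maps each author URI to the list
# of paper URIs that author appears on. Both URIs below are hypothetical
# placeholders, not real records.
import shelve

d = shelve.open("authors.list")
d["authorUri2paper"] = {
    "http://www.hse.ru/org/persons/123": [
        "http://publications.hse.ru/view/456",
    ],
}
d.close()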
# -*- coding: koi8-r -*-
from nlp_models import session, HseArticle
import urllib2
import lxml.html
import re
from bs4 import UnicodeDammit
import shelve

## Request all hse_articles from the DB and map publication URI -> row id
pub_dic = {}
articles = session.query(HseArticle).all()
for i in articles:
    if i.uri not in pub_dic:
        pub_dic[i.uri] = i.id

author2paper = {}
author2uri = {}
isEn = re.compile(".*[a-zA-Z].*")

# Deduplicated set of publication-page URLs to crawl
urls = set([])
with open('publications.txt', 'r') as f:
    for url in f.readlines():
        urls.add(url)
print len(urls)

j = 1
for pub in urls:
    pub = pub.decode('utf-8', 'ignore').rstrip()
    print str(j) + " Scanning {}. . . ".format(pub)
    try:
        content = urllib2.urlopen(pub.encode('utf-8')).read().decode('utf-8', 'ignore')
        doc = lxml.html.document_fromstring(content)
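        # The continuation of this script is not shown here. Purely as a
        # hypothetical illustration (not this crawler's actual logic), the
        # parsed document could be queried with lxml like so:
        for href in doc.xpath('//a/@href'):
            if href.startswith('http://publications.hse.ru/view/'):
                print href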