Example #1
File: EvalCACM.py Project: Benlog/RI
    def __init__(self, req, jug):
        self.pars = ParserCACM()
        self.pars.initFile(req)
        self.jug = pd.read_csv(
            jug,
            delim_whitespace=True,
            header=None,
            index_col=False,
            names=['queryId', 'docId', 'sub-theme', 'score'],
            dtype={'queryId': str, 'docId': str},
            converters={'queryId': lambda x: str(int(x))})
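The judgments table loaded this way can then be filtered per query id, which is how Example #4 below consumes it in `__next__`. A minimal standalone sketch (not from the project), assuming the relevance file path that appears in Example #13:

import pandas as pd

# Sketch: load the judgments the same way as above, then collect the
# documents judged relevant for query "1".
jug = pd.read_csv(
    'cacm/cacm.rel',  # assumed path, taken from Example #13
    delim_whitespace=True,
    header=None,
    index_col=False,
    names=['queryId', 'docId', 'sub-theme', 'score'],
    dtype={'queryId': str, 'docId': str},
    converters={'queryId': lambda x: str(int(x))})

relevant = (jug[jug.queryId == '1']
            .drop('queryId', axis=1)
            .set_index('docId')
            .to_dict('index'))
# relevant maps each docId to {'sub-theme': ..., 'score': ...}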
Example #2
    def indexation(self):
        # create docs, stems, docFrom, parser, stemmer and link maps
        self.docs = CreatDocs()
        self.stems = CreatTerms()
        self.docFrom = CreatdocFrom()
        self.parser = ParserCACM()
        self.texRepresenter = PorterStemmer()
        self.linkin, self.linkout = createlink()
        return self
Example #3
class QueryParser(object):
    """Class for query reading from file"""
    def __init__(self, query_file, relevance_file):
        self.query = open(query_file, 'r')
        self.textRepresenter = PorterStemmer()

        # flag so the source file is closed only once
        self.already_closed = False

        #Create parser to read query_file
        #WARNING WILL ONLY WORK ON CACM DATASET TODO FIND SOLUTION
        self.parser = ParserCACM()
        self.parser.initFile(query_file)

        #Build a dictionary (query_id, list of relevant documents)
        self.relevant_docs = {}
        with open(relevance_file, 'r') as f:
            for line in f:
                data = line.split(" ")
                query_id = int(data[0])
                if query_id not in self.relevant_docs:
                    self.relevant_docs[query_id] = []
                # One list is appended per relevant doc, for later use as a (theme, score) couple
                self.relevant_docs[query_id].append(
                    [int(data[1]), None, None])

    def nextQuery(self):
        """Return next Query object"""

        query_data = self.parser.nextDocument()

        if query_data is None:
            if not self.already_closed:
                self.query.close()
                self.already_closed = True
            # return the sentinel on every call past the end, not only the
            # first, so callers never dereference None below
            return -1

        query_id = query_data.getId()
        query_text = query_data.getText()
        query_tf = self.textRepresenter.getTextRepresentation(query_text)
        relevant_docs_to_query = np.array(
            self.relevant_docs.get(int(query_id), [[None, None, None]]))
        return Query(query_id, query_text, query_tf, relevant_docs_to_query)
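A possible driver loop for this class (a sketch, not taken from the source): `nextQuery` returns `-1` once the query file is exhausted, so iterate until that sentinel appears. File names are assumed from Example #13.

qp = QueryParser('cacm/cacm.qry', 'cacm/cacm.rel')
n = 0
query = qp.nextQuery()
while query != -1:
    n += 1  # each item is a Query(id, text, tf, relevant_docs) object
    query = qp.nextQuery()
print(n, 'queries read')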
Example #4
File: EvalCACM.py Project: Benlog/RI
class QueryParserCACM(QueryParser):
    def __init__(self, req, jug):
        self.pars = ParserCACM()
        self.pars.initFile(req)
        self.jug = pd.read_csv(
            jug,
            delim_whitespace=True,
            header=None,
            index_col=False,
            names=['queryId', 'docId', 'sub-theme', 'score'],
            dtype={'queryId': str, 'docId': str},
            converters={'queryId': lambda x: str(int(x))})

    def nextQuery(self):
        '''
        :return: a dictionary {'id': query id, 'text': query text,
                 'revelent': {docId: {'sub-theme': ..., 'score': ...}}}
        '''
        return self.__next__()

    def __next__(self):
        q = self.pars.nextDocument()
        if q:
            return {
                'id': q.identifier,
                'text': q.text,
                'revelent': self.jug[self.jug.queryId == q.getId()]
                    .drop('queryId', axis=1)
                    .set_index('docId')
                    .to_dict('index'),
            }
        raise StopIteration()

    def __iter__(self):
        return self
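Since the class defines `__iter__` and `__next__`, it can also be consumed directly with a `for` loop; a sketch, again assuming the CACM file names used in Example #13:

for q in QueryParserCACM('cacm/cacm.qry', 'cacm/cacm.rel'):
    # q is the dictionary described in nextQuery's docstring
    print(q['id'], len(q['revelent']))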
Example #6
File: main.py Project: kalifou/ri_tme1
def test_weighter():
    parser = ParserCACM()
    textRepresenter = PorterStemmer()
    fname = "data/cacm/cacm.txt"
    I = Index(parser, textRepresenter)
    I.indexation(fname)
    weighters = [Binary(I), TF(I), TF_IDF(I), Log(I), Log_plus(I)]
    for i, w in enumerate(weighters):
        print "Test of weighter" + str(i)
        print "getDocWeightsForDoc"
        print w.getDocWeightsForDoc("20")
        print "getDocWeightsForStem"
        print w.getDocWeightsForStem("accelerat")
        print "getDocWeightsForQuery"
        print w.getWeightsForQuery(I.getTfsForDoc("20"))
Example #7
def initIndex(database_file):
    """Init Index or load it if previously computed"""
    sys.stdout.write("Indexing database...")
    sys.stdout.flush()
    if os.path.isfile('Index.p'):
        I = pickle.load(open("Index.p", "rb"))
    else:
        parser = ParserCACM()
        textRepresenter = PorterStemmer()
        I = Index(parser, textRepresenter)
        I.indexation(database_file)
        I.parser = None
        pickle.dump(I, open("Index.p", "wb"))

    sys.stdout.write("Done!\n")
    sys.stdout.flush()
    
    return I
Example #8
def createlink():
    linkout = {}
    linkin = {}
    ps = ParserCACM()
    ps.initFile("cacm/cacm.txt")
    doc = ps.nextDocument()
    while doc:
        linkout[doc.getId()] = set()
        links = doc.get('links').split(';')
        for link in links:
            if link != '':
                linkout[doc.getId()].add(link)
                if link not in linkin:
                    linkin[link] = set()  # a set keeps each citing doc unique
                linkin[link].add(doc.getId())
        doc = ps.nextDocument()
    return linkin, linkout
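`createlink` returns two maps from document ids to sets: `linkout` (documents a document cites) and `linkin` (documents citing it). A quick sketch of how they might be used, e.g. to list the most-cited documents:

linkin, linkout = createlink()
# five documents with the largest number of incoming links
for doc_id in sorted(linkin, key=lambda d: len(linkin[d]), reverse=True)[:5]:
    print(doc_id, len(linkin[doc_id]))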
Example #9
File: main.py Project: kalifou/ri_tme1
        print "get top 3 documents = ", '[%s]' % ', '.join(
            map(str, query_results[i][0:3]))
    return query_results


if __name__ == "__main__":

    fname = "data/cacm/cacm.txt"

    sys.stdout.write("Indexing database...")
    sys.stdout.flush()
    if os.path.isfile('Index.p'):
        I = pickle.load(open("Index.p", "rb"))

    else:
        parser = ParserCACM()
        textRepresenter = PorterStemmer()
        I = Index(parser, textRepresenter)
        I.indexation(fname)
        I.parser = None
        pickle.dump(I, open("Index.p", "wb"))

    sys.stdout.write("Done!\n")
    sys.stdout.flush()

    sys.stdout.write("Creating weighters...")
    sys.stdout.flush()

    if os.path.isfile('Models.p'):
        models = pickle.load(open("Models.p", "rb"))
    else:
Example #10
	def __init__(self):
		ParserCACM.__init__(self)
Example #11
        with open(self.stems_filename, 'w') as f:
            json.dump(stems, f)
        with open(self.docs_filename, 'w') as f:
            json.dump(docs, f)

        return

    def string2json(self, string):
        '''convert "{'4': 1}{'7': 10}" to {'4':1, '7':10} '''
        string_list = [s + '}' for s in string.split('}')][:-1]
        json_list = [ast.literal_eval(s) for s in string_list]
        # merge via items(); d.keys()[0] is Python 2 only
        return {k: v for d in json_list for k, v in d.items()}


if __name__ == '__main__':
    parser = ParserCACM()
    parser.initFile("cacm/cacm.txt")

    index = Index(name='test',
                  parser=ParserCACM,
                  textRepresenter=PorterStemmer,
                  create_index=False)

    # read forward to the 20th document and inspect it
    for _ in range(20):
        doc = parser.nextDocument()
    doc_id = doc.getId()
    doc_rep = PorterStemmer().getTextRepresentation(doc.getText())

    print('doc_rep', doc_rep)
    print('length', index.getDocsLength(doc_id))
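For reference, `string2json` simply undoes the concatenated-dict encoding shown in its docstring; a hypothetical call on the `index` created above:

print(index.string2json("{'4': 1}{'7': 10}"))  # -> {'4': 1, '7': 10}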
Example #12
File: Index.py Project: gozuslayer/RI
		index_file_inverse.seek(self.index_inverse[stem][0])
		doc_tf = index_file_inverse.read(self.index_inverse[stem][1])
		index_file_inverse.flush()
		index_file_inverse.close()
		return doc_tf

	def getStrDoc(self, id_doc):
		f = open(self.docFrom[id_doc][0], 'r')
		f.seek(self.docFrom[id_doc][1])
		doc = f.read(self.docFrom[id_doc][2])
		f.flush()
		f.close()
		return doc


	def _dict_to_file(self, dict):
		return ''.join([self._line_to_file(i, dict[i]) for i in dict.keys()])

	def _line_to_file(self, i, v):
		return str(i) + '|' + str(v) + ' '


tr = PorterStemmer()
parser = ParserCACM()
# raw string keeps the Windows backslashes literal ("\U" would otherwise break)
parser.initFile(r"C:\Users\king\Desktop\DAC2\RI\TP1-6\RI\cacm\cacm.txt")
doc = parser.nextDocument()
print(type(doc.others["links"]))
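`_line_to_file` and `_dict_to_file` above define this Index's on-disk encoding: each entry becomes `key|value` followed by a space. A standalone sketch of the format (a copy of `_line_to_file` for illustration only):

def line_to_file(i, v):
    # same body as Index._line_to_file above
    return str(i) + '|' + str(v) + ' '

print(''.join(line_to_file(k, v) for k, v in {'4': 1, '7': 2}.items()))
# -> "4|1 7|2 "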


 
Example #13
from ParserCACM import ParserCACM, QueryParser
from TextRepresenter import PorterStemmer


if __name__ == '__main__':

    rel_filename = 'cacm/cacm.rel'
    query_filename = 'cacm/cacm.qry'

    index = Index(name='test', docFrom=None,
                  parser=ParserCACM, textRepresenter=PorterStemmer,
                  create_index=False)


    weighter = Weighter(index)
    parser = ParserCACM()
    parser.initFile('cacm/cacm.txt')
    doc = parser.nextDocument()
    print(doc.others['links'])
    # for d in range(20,22):
    #     docId = str(d)
    #     print(ParserCACM().getDocument(docId))
        # print(weighter.getDocWeightsForDoc(docId), index.getDocsLength(docId))

    # q = QueryParser(query_filename, rel_filename)
    # train_queries, test_queries = q.split_query_dataset()
    # print(len(train_queries), len(test_queries))